{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.3022670025188917, "eval_steps": 500, "global_step": 480, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4658.0, "completions/max_terminated_length": 4658.0, "completions/mean_length": 2019.5833740234375, "completions/mean_terminated_length": 2019.5833740234375, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "entropy": 0.4210514649748802, "epoch": 0.0006297229219143577, "frac_reward_zero_std": 0.0, "grad_norm": 0.13319100967563474, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.2334, "num_tokens": 57734.0, "reward": 2.4166667461395264, "reward_std": 0.6178354024887085, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4177498817443848, "sampling/importance_sampling_ratio/mean": 0.9998350143432617, "sampling/importance_sampling_ratio/min": 0.6024585962295532, "sampling/sampling_logp_difference/max": 0.5067362785339355, "sampling/sampling_logp_difference/mean": 0.010630311444401741, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7135.0, "completions/mean_length": 3209.5, "completions/mean_terminated_length": 2992.86962890625, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "entropy": 0.4489719718694687, "epoch": 0.0012594458438287153, "frac_reward_zero_std": 0.0, "grad_norm": 0.1053669238803597, "kl": 0.00010793787623697426, "learning_rate": 9.99999021550048e-07, "loss": -0.0576, "num_tokens": 151466.0, "reward": 1.7083333730697632, "reward_std": 0.7481458187103271, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.7756357192993164, "sampling/importance_sampling_ratio/mean": 0.9999839663505554, "sampling/importance_sampling_ratio/min": 0.6296979188919067, "sampling/sampling_logp_difference/max": 0.5741584300994873, "sampling/sampling_logp_difference/mean": 0.010660020634531975, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3930.0, "completions/max_terminated_length": 3930.0, "completions/mean_length": 2187.625, "completions/mean_terminated_length": 2187.625, "completions/min_length": 789.0, "completions/min_terminated_length": 789.0, "entropy": 0.3104456439614296, "epoch": 0.001889168765743073, "frac_reward_zero_std": 0.0, "grad_norm": 0.09770100859011285, "kl": 7.992511018528603e-05, "learning_rate": 9.999960862040218e-07, "loss": 0.0359, "num_tokens": 231001.0, "reward": 2.0416667461395264, "reward_std": 0.5461008548736572, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.664773941040039, "sampling/importance_sampling_ratio/mean": 1.0000864267349243, "sampling/importance_sampling_ratio/min": 0.5674299001693726, "sampling/sampling_logp_difference/max": 0.5666379928588867, "sampling/sampling_logp_difference/mean": 0.008346350863575935, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5762.0, "completions/max_terminated_length": 5762.0, "completions/mean_length": 2920.20849609375, "completions/mean_terminated_length": 2920.20849609375, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "entropy": 0.4726797565817833, "epoch": 0.0025188916876574307, "frac_reward_zero_std": 0.0, "grad_norm": 0.0930746209582487, "kl": 0.00011642166646197438, "learning_rate": 9.999911939734095e-07, "loss": 0.0114, "num_tokens": 316782.0, "reward": 2.125, "reward_std": 0.6380135416984558, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.3900724649429321, "sampling/importance_sampling_ratio/mean": 0.9998952746391296, "sampling/importance_sampling_ratio/min": 0.1799740493297577, "sampling/sampling_logp_difference/max": 1.7149426937103271, "sampling/sampling_logp_difference/mean": 0.011150488629937172, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6346.0, "completions/max_terminated_length": 6346.0, "completions/mean_length": 2413.875, "completions/mean_terminated_length": 2413.875, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.3316522538661957, "epoch": 0.0031486146095717885, "frac_reward_zero_std": 0.0, "grad_norm": 0.1384471916173281, "kl": 8.805103425402194e-05, "learning_rate": 9.999843448773583e-07, "loss": -0.1543, "num_tokens": 393947.0, "reward": 1.9583333730697632, "reward_std": 0.8528895378112793, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7888569831848145, "sampling/importance_sampling_ratio/mean": 0.9999451637268066, "sampling/importance_sampling_ratio/min": 0.3026273250579834, "sampling/sampling_logp_difference/max": 1.1952531337738037, "sampling/sampling_logp_difference/mean": 0.008920865133404732, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 4627.0, "completions/mean_length": 2678.08349609375, "completions/mean_terminated_length": 2176.818359375, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "entropy": 0.32714637368917465, "epoch": 0.003778337531486146, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10329187869270541, "kl": 8.567891200073063e-05, "learning_rate": 9.999755389426746e-07, "loss": 0.1653, "num_tokens": 476693.0, "reward": 2.25, "reward_std": 0.5443090200424194, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.7916666865348816, "rewards/format_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 1.8678808212280273, "sampling/importance_sampling_ratio/mean": 1.0000635385513306, "sampling/importance_sampling_ratio/min": 0.5644020438194275, "sampling/sampling_logp_difference/max": 0.6248044967651367, "sampling/sampling_logp_difference/mean": 0.008075688034296036, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5820.0, "completions/max_terminated_length": 5820.0, "completions/mean_length": 3389.33349609375, "completions/mean_terminated_length": 3389.33349609375, "completions/min_length": 1758.0, "completions/min_terminated_length": 1758.0, "entropy": 0.40796083956956863, "epoch": 0.004408060453400504, "frac_reward_zero_std": 0.0, "grad_norm": 0.0945745725626619, "kl": 0.00011977510075666942, "learning_rate": 9.999647762038227e-07, "loss": 0.0653, "num_tokens": 590893.0, "reward": 1.5416667461395264, "reward_std": 0.48371022939682007, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7651031017303467, "sampling/importance_sampling_ratio/mean": 0.999859631061554, "sampling/importance_sampling_ratio/min": 0.29632213711738586, "sampling/sampling_logp_difference/max": 1.2163081169128418, "sampling/sampling_logp_difference/mean": 0.010296512395143509, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5557.0, "completions/max_terminated_length": 5557.0, "completions/mean_length": 2832.791748046875, "completions/mean_terminated_length": 2832.791748046875, "completions/min_length": 946.0, "completions/min_terminated_length": 946.0, "entropy": 0.42847730219364166, "epoch": 0.005037783375314861, "frac_reward_zero_std": 0.0, "grad_norm": 0.10549648768793089, "kl": 0.00013063049482298084, "learning_rate": 9.999520567029256e-07, "loss": -0.0108, "num_tokens": 673064.0, "reward": 2.0833334922790527, "reward_std": 0.747992217540741, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998395442962646, "sampling/importance_sampling_ratio/min": 0.14722904562950134, "sampling/sampling_logp_difference/max": 1.9157657623291016, "sampling/sampling_logp_difference/mean": 0.010409778915345669, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4656.0, "completions/max_terminated_length": 4656.0, "completions/mean_length": 2399.70849609375, "completions/mean_terminated_length": 2399.70849609375, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "entropy": 0.4456804469227791, "epoch": 0.005667506297229219, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08701360303306285, "kl": 0.00010597942309686914, "learning_rate": 9.999373804897653e-07, "loss": -0.0796, "num_tokens": 747153.0, "reward": 1.6666667461395264, "reward_std": 0.49179765582084656, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.4843926429748535, "sampling/importance_sampling_ratio/mean": 1.0001953840255737, "sampling/importance_sampling_ratio/min": 0.4879997670650482, "sampling/sampling_logp_difference/max": 0.7174403667449951, "sampling/sampling_logp_difference/mean": 0.010306030511856079, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8068.0, "completions/mean_length": 4736.2919921875, "completions/mean_terminated_length": 4586.04345703125, "completions/min_length": 2056.0, "completions/min_terminated_length": 2056.0, "entropy": 0.5212108120322227, "epoch": 0.006297229219143577, "frac_reward_zero_std": 0.0, "grad_norm": 0.084978749433485, "kl": 0.000153803270222852, "learning_rate": 9.999207476217814e-07, "loss": 0.0033, "num_tokens": 874600.0, "reward": 1.5, "reward_std": 0.6741900444030762, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6572247743606567, "sampling/importance_sampling_ratio/mean": 0.9998815059661865, "sampling/importance_sampling_ratio/min": 0.2305377721786499, "sampling/sampling_logp_difference/max": 1.4673405885696411, "sampling/sampling_logp_difference/mean": 0.012483730912208557, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5980.0, "completions/max_terminated_length": 5980.0, "completions/mean_length": 2365.33349609375, "completions/mean_terminated_length": 2365.33349609375, "completions/min_length": 886.0, "completions/min_terminated_length": 886.0, "entropy": 0.41194161027669907, "epoch": 0.0069269521410579345, "frac_reward_zero_std": 0.0, "grad_norm": 0.09945490894076871, "kl": 0.00011883280967595056, "learning_rate": 9.999021581640717e-07, "loss": -0.0301, "num_tokens": 945320.0, "reward": 2.1666667461395264, "reward_std": 0.6495786905288696, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434515476227, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4858659505844116, "sampling/importance_sampling_ratio/mean": 0.9997332692146301, "sampling/importance_sampling_ratio/min": 0.5559789538383484, "sampling/sampling_logp_difference/max": 0.5870249271392822, "sampling/sampling_logp_difference/mean": 0.01031479611992836, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5151.0, "completions/max_terminated_length": 5151.0, "completions/mean_length": 2777.916748046875, "completions/mean_terminated_length": 2777.916748046875, "completions/min_length": 1097.0, "completions/min_terminated_length": 1097.0, "entropy": 0.4417632073163986, "epoch": 0.007556675062972292, "frac_reward_zero_std": 0.0, "grad_norm": 0.10663174294163696, "kl": 0.0001484672393416986, "learning_rate": 9.998816121893914e-07, "loss": 0.0495, "num_tokens": 1035182.0, "reward": 1.7083333730697632, "reward_std": 0.580485463142395, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6745558977127075, "sampling/importance_sampling_ratio/mean": 0.9997747540473938, "sampling/importance_sampling_ratio/min": 0.491554856300354, "sampling/sampling_logp_difference/max": 0.710181713104248, "sampling/sampling_logp_difference/mean": 0.01105101965367794, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4587.0, "completions/max_terminated_length": 4587.0, "completions/mean_length": 2635.125, "completions/mean_terminated_length": 2635.125, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "entropy": 0.5015984550118446, "epoch": 0.00818639798488665, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09807734978111479, "kl": 0.0001381087404297432, "learning_rate": 9.998591097781535e-07, "loss": -0.0464, "num_tokens": 1109185.0, "reward": 1.625, "reward_std": 0.5573514699935913, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.0833333358168602, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5490024089813232, "sampling/importance_sampling_ratio/mean": 0.9999603629112244, "sampling/importance_sampling_ratio/min": 0.618939220905304, "sampling/sampling_logp_difference/max": 0.4797482490539551, "sampling/sampling_logp_difference/mean": 0.01117611862719059, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5041.0, "completions/mean_length": 2962.916748046875, "completions/mean_terminated_length": 2735.565185546875, "completions/min_length": 1342.0, "completions/min_terminated_length": 1342.0, "entropy": 0.49240633100271225, "epoch": 0.008816120906801008, "frac_reward_zero_std": 0.0, "grad_norm": 0.182305173768795, "kl": 0.00017398431009496562, "learning_rate": 9.998346510184278e-07, "loss": 0.1299, "num_tokens": 1194407.0, "reward": 1.7916667461395264, "reward_std": 0.6439208984375, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4271119832992554, "sampling/importance_sampling_ratio/mean": 1.0001176595687866, "sampling/importance_sampling_ratio/min": 0.5143397450447083, "sampling/sampling_logp_difference/max": 0.6648712158203125, "sampling/sampling_logp_difference/mean": 0.012051875703036785, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4076.0, "completions/max_terminated_length": 4076.0, "completions/mean_length": 2624.5, "completions/mean_terminated_length": 2624.5, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "entropy": 0.3751116469502449, "epoch": 0.009445843828715366, "frac_reward_zero_std": 0.0, "grad_norm": 0.10101883482778312, "kl": 0.00015486314441659488, "learning_rate": 9.998082360059415e-07, "loss": -0.0917, "num_tokens": 1297667.0, "reward": 1.7083333730697632, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4622596502304077, "sampling/importance_sampling_ratio/mean": 0.9999274611473083, "sampling/importance_sampling_ratio/min": 0.4899708330631256, "sampling/sampling_logp_difference/max": 0.713409423828125, "sampling/sampling_logp_difference/mean": 0.010084840469062328, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7455.0, "completions/max_terminated_length": 7455.0, "completions/mean_length": 2455.916748046875, "completions/mean_terminated_length": 2455.916748046875, "completions/min_length": 1345.0, "completions/min_terminated_length": 1345.0, "entropy": 0.4698982685804367, "epoch": 0.010075566750629723, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09127785786251096, "kl": 0.0001729320538288448, "learning_rate": 9.99779864844077e-07, "loss": 0.0446, "num_tokens": 1370001.0, "reward": 2.125, "reward_std": 0.4261821210384369, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.371492862701416, "sampling/importance_sampling_ratio/mean": 0.9998663067817688, "sampling/importance_sampling_ratio/min": 0.6139606237411499, "sampling/sampling_logp_difference/max": 0.4878244400024414, "sampling/sampling_logp_difference/mean": 0.0108026172965765, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5459.0, "completions/max_terminated_length": 5459.0, "completions/mean_length": 2405.416748046875, "completions/mean_terminated_length": 2405.416748046875, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "entropy": 0.5175765454769135, "epoch": 0.010705289672544081, "frac_reward_zero_std": 0.0, "grad_norm": 0.10731255412710579, "kl": 0.000174223769136006, "learning_rate": 9.997495376438735e-07, "loss": 0.0743, "num_tokens": 1447979.0, "reward": 1.7916667461395264, "reward_std": 0.5078567266464233, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.6359976530075073, "sampling/importance_sampling_ratio/mean": 0.9997338652610779, "sampling/importance_sampling_ratio/min": 0.4999285042285919, "sampling/sampling_logp_difference/max": 0.6932902336120605, "sampling/sampling_logp_difference/mean": 0.012671376578509808, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4960.0, "completions/max_terminated_length": 4960.0, "completions/mean_length": 2309.375, "completions/mean_terminated_length": 2309.375, "completions/min_length": 1162.0, "completions/min_terminated_length": 1162.0, "entropy": 0.2821624241769314, "epoch": 0.011335012594458438, "frac_reward_zero_std": 0.0, "grad_norm": 0.10265589470392167, "kl": 0.0001322977441304829, "learning_rate": 9.997172545240259e-07, "loss": 0.044, "num_tokens": 1530004.0, "reward": 2.125, "reward_std": 0.5480016469955444, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000059962272644, "sampling/importance_sampling_ratio/min": 0.5323473811149597, "sampling/sampling_logp_difference/max": 0.7202987670898438, "sampling/sampling_logp_difference/mean": 0.0074942223727703094, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3875.0, "completions/max_terminated_length": 3875.0, "completions/mean_length": 2271.291748046875, "completions/mean_terminated_length": 2271.291748046875, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "entropy": 0.33591075241565704, "epoch": 0.011964735516372796, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09928742828497389, "kl": 0.0001646234122745227, "learning_rate": 9.996830156108836e-07, "loss": 0.1043, "num_tokens": 1600811.0, "reward": 2.2083334922790527, "reward_std": 0.48112308979034424, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.84153151512146, "sampling/importance_sampling_ratio/mean": 1.0000792741775513, "sampling/importance_sampling_ratio/min": 0.667834997177124, "sampling/sampling_logp_difference/max": 0.6105976104736328, "sampling/sampling_logp_difference/mean": 0.008498751558363438, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7138.0, "completions/max_terminated_length": 7138.0, "completions/mean_length": 2944.75, "completions/mean_terminated_length": 2944.75, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "entropy": 0.47743383795022964, "epoch": 0.012594458438287154, "frac_reward_zero_std": 0.0, "grad_norm": 0.09010166738806653, "kl": 0.0001864601181296166, "learning_rate": 9.996468210384507e-07, "loss": -0.1017, "num_tokens": 1694501.0, "reward": 1.9583333730697632, "reward_std": 0.8751834630966187, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.876184105873108, "sampling/importance_sampling_ratio/mean": 1.0001300573349, "sampling/importance_sampling_ratio/min": 0.5433267951011658, "sampling/sampling_logp_difference/max": 0.6292400360107422, "sampling/sampling_logp_difference/mean": 0.010847898200154305, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3365.0, "completions/max_terminated_length": 3365.0, "completions/mean_length": 1636.5833740234375, "completions/mean_terminated_length": 1636.5833740234375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "entropy": 0.37664221227169037, "epoch": 0.01322418136020151, "frac_reward_zero_std": 0.0, "grad_norm": 0.12299755156505661, "kl": 0.00016453092393931001, "learning_rate": 9.99608670948386e-07, "loss": -0.1609, "num_tokens": 1745355.0, "reward": 2.375, "reward_std": 0.721926212310791, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.3187103271484375, "sampling/importance_sampling_ratio/mean": 1.0000964403152466, "sampling/importance_sampling_ratio/min": 0.5351955890655518, "sampling/sampling_logp_difference/max": 0.6251230239868164, "sampling/sampling_logp_difference/mean": 0.009163275361061096, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6700.0, "completions/mean_length": 3540.416748046875, "completions/mean_terminated_length": 3117.545654296875, "completions/min_length": 958.0, "completions/min_terminated_length": 958.0, "entropy": 0.46891912817955017, "epoch": 0.013853904282115869, "frac_reward_zero_std": 0.0, "grad_norm": 0.10417092154000204, "kl": 0.00020557177776936442, "learning_rate": 9.99568565490001e-07, "loss": 0.0614, "num_tokens": 1844253.0, "reward": 2.125, "reward_std": 0.7765914797782898, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.7385443449020386, "sampling/importance_sampling_ratio/mean": 0.9999731183052063, "sampling/importance_sampling_ratio/min": 0.5094287991523743, "sampling/sampling_logp_difference/max": 0.6744651794433594, "sampling/sampling_logp_difference/mean": 0.01127721555531025, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4734.0, "completions/max_terminated_length": 4734.0, "completions/mean_length": 1857.916748046875, "completions/mean_terminated_length": 1857.916748046875, "completions/min_length": 611.0, "completions/min_terminated_length": 611.0, "entropy": 0.40257149934768677, "epoch": 0.014483627204030227, "frac_reward_zero_std": 0.0, "grad_norm": 0.11030464789670472, "kl": 0.00019082466315012425, "learning_rate": 9.995265048202603e-07, "loss": -0.0505, "num_tokens": 1897179.0, "reward": 1.875, "reward_std": 0.6705640554428101, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.4830541610717773, "sampling/importance_sampling_ratio/mean": 0.9997243881225586, "sampling/importance_sampling_ratio/min": 0.5894954800605774, "sampling/sampling_logp_difference/max": 0.5284881591796875, "sampling/sampling_logp_difference/mean": 0.009707729332149029, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3637.0, "completions/max_terminated_length": 3637.0, "completions/mean_length": 1516.75, "completions/mean_terminated_length": 1516.75, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "entropy": 0.3712960034608841, "epoch": 0.015113350125944584, "frac_reward_zero_std": 0.0, "grad_norm": 0.1437640960039758, "kl": 0.00020267261788831092, "learning_rate": 9.994824891037815e-07, "loss": -0.0774, "num_tokens": 1943333.0, "reward": 2.3333334922790527, "reward_std": 0.5858359336853027, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.8483697175979614, "sampling/importance_sampling_ratio/mean": 1.0000735521316528, "sampling/importance_sampling_ratio/min": 0.7362657189369202, "sampling/sampling_logp_difference/max": 0.6143040657043457, "sampling/sampling_logp_difference/mean": 0.008732253685593605, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2968.0, "completions/max_terminated_length": 2968.0, "completions/mean_length": 1773.916748046875, "completions/mean_terminated_length": 1773.916748046875, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 0.3279975801706314, "epoch": 0.015743073047858942, "frac_reward_zero_std": 0.0, "grad_norm": 0.0971762113049611, "kl": 0.0002006337308557704, "learning_rate": 9.994365185128325e-07, "loss": -0.1292, "num_tokens": 2004859.0, "reward": 2.4166667461395264, "reward_std": 0.8303279876708984, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.495943546295166, "sampling/importance_sampling_ratio/mean": 0.9999167323112488, "sampling/importance_sampling_ratio/min": 0.5456262230873108, "sampling/sampling_logp_difference/max": 0.6058211326599121, "sampling/sampling_logp_difference/mean": 0.008219756186008453, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4153.0, "completions/max_terminated_length": 4153.0, "completions/mean_length": 2130.041748046875, "completions/mean_terminated_length": 2130.041748046875, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "entropy": 0.4601491689682007, "epoch": 0.0163727959697733, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09273094003174771, "kl": 0.00023669248912483454, "learning_rate": 9.993885932273336e-07, "loss": -0.0111, "num_tokens": 2068428.0, "reward": 1.7083333730697632, "reward_std": 0.5301506519317627, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997968673706055, "sampling/importance_sampling_ratio/min": 0.5744724273681641, "sampling/sampling_logp_difference/max": 0.7584476470947266, "sampling/sampling_logp_difference/mean": 0.010950549505650997, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7093.0, "completions/max_terminated_length": 7093.0, "completions/mean_length": 4092.666748046875, "completions/mean_terminated_length": 4092.666748046875, "completions/min_length": 1803.0, "completions/min_terminated_length": 1803.0, "entropy": 0.43261653929948807, "epoch": 0.01700251889168766, "frac_reward_zero_std": 0.0, "grad_norm": 0.08126866361122292, "kl": 0.00020524336650851183, "learning_rate": 9.993387134348547e-07, "loss": 0.0415, "num_tokens": 2189372.0, "reward": 1.375, "reward_std": 0.622288167476654, "rewards/cloze_reward/mean": 0.25, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5542250871658325, "sampling/importance_sampling_ratio/mean": 0.999952495098114, "sampling/importance_sampling_ratio/min": 0.291919469833374, "sampling/sampling_logp_difference/max": 1.2312772274017334, "sampling/sampling_logp_difference/mean": 0.010431580245494843, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5932.0, "completions/max_terminated_length": 5932.0, "completions/mean_length": 2632.95849609375, "completions/mean_terminated_length": 2632.95849609375, "completions/min_length": 1165.0, "completions/min_terminated_length": 1165.0, "entropy": 0.3962131217122078, "epoch": 0.017632241813602016, "frac_reward_zero_std": 0.0, "grad_norm": 0.11255972502598811, "kl": 0.00020592461441992782, "learning_rate": 9.992868793306152e-07, "loss": -0.0138, "num_tokens": 2264859.0, "reward": 2.2083334922790527, "reward_std": 0.6985861659049988, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5260305404663086, "sampling/importance_sampling_ratio/mean": 0.9999988675117493, "sampling/importance_sampling_ratio/min": 0.6932854652404785, "sampling/sampling_logp_difference/max": 0.4226698875427246, "sampling/sampling_logp_difference/mean": 0.009381860494613647, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6667.0, "completions/max_terminated_length": 6667.0, "completions/mean_length": 3602.916748046875, "completions/mean_terminated_length": 3602.916748046875, "completions/min_length": 1848.0, "completions/min_terminated_length": 1848.0, "entropy": 0.45246804505586624, "epoch": 0.018261964735516372, "frac_reward_zero_std": 0.0, "grad_norm": 0.10156687412612625, "kl": 0.00032350883338949643, "learning_rate": 9.992330911174832e-07, "loss": -0.1772, "num_tokens": 2362369.0, "reward": 2.125, "reward_std": 0.7791786193847656, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000933408737183, "sampling/importance_sampling_ratio/min": 0.07298053056001663, "sampling/sampling_logp_difference/max": 2.6175625324249268, "sampling/sampling_logp_difference/mean": 0.011528318747878075, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7864.0, "completions/mean_length": 4017.83349609375, "completions/mean_terminated_length": 3836.347900390625, "completions/min_length": 1326.0, "completions/min_terminated_length": 1326.0, "entropy": 0.4932340234518051, "epoch": 0.018891687657430732, "frac_reward_zero_std": 0.0, "grad_norm": 0.09345090299210843, "kl": 0.0002540338864491787, "learning_rate": 9.991773490059754e-07, "loss": 0.2397, "num_tokens": 2480357.0, "reward": 1.5416667461395264, "reward_std": 0.6643140316009521, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.75, "rewards/format_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5581289529800415, "sampling/importance_sampling_ratio/mean": 0.9999625086784363, "sampling/importance_sampling_ratio/min": 0.0866241529583931, "sampling/sampling_logp_difference/max": 2.446176528930664, "sampling/sampling_logp_difference/mean": 0.012268203310668468, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3172.0, "completions/max_terminated_length": 3172.0, "completions/mean_length": 1271.541748046875, "completions/mean_terminated_length": 1271.541748046875, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "entropy": 0.30131806433200836, "epoch": 0.01952141057934509, "frac_reward_zero_std": 0.0, "grad_norm": 0.1323161234349343, "kl": 0.0002569614771346096, "learning_rate": 9.991196532142552e-07, "loss": -0.1414, "num_tokens": 2519530.0, "reward": 2.4583334922790527, "reward_std": 0.8409954905509949, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999778866767883, "sampling/importance_sampling_ratio/min": 0.6958978176116943, "sampling/sampling_logp_difference/max": 0.7334165573120117, "sampling/sampling_logp_difference/mean": 0.007927864789962769, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5448.0, "completions/max_terminated_length": 5448.0, "completions/mean_length": 2553.75, "completions/mean_terminated_length": 2553.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3753006011247635, "epoch": 0.020151133501259445, "frac_reward_zero_std": 0.0, "grad_norm": 0.09486353200921827, "kl": 0.0002419100965198595, "learning_rate": 9.990600039681321e-07, "loss": -0.1049, "num_tokens": 2591572.0, "reward": 2.0833334922790527, "reward_std": 0.8765602111816406, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5518146753311157, "sampling/importance_sampling_ratio/mean": 1.0000338554382324, "sampling/importance_sampling_ratio/min": 0.5173311829566956, "sampling/sampling_logp_difference/max": 0.6590719223022461, "sampling/sampling_logp_difference/mean": 0.00975863542407751, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6884.0, "completions/mean_length": 2840.75, "completions/mean_terminated_length": 2608.0869140625, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "entropy": 0.44207197427749634, "epoch": 0.020780856423173802, "frac_reward_zero_std": 0.0, "grad_norm": 0.11549699634697332, "kl": 0.000278278486803174, "learning_rate": 9.989984015010614e-07, "loss": -0.0587, "num_tokens": 2672918.0, "reward": 1.9583333730697632, "reward_std": 0.8013812899589539, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.9764635562896729, "sampling/importance_sampling_ratio/mean": 1.0000499486923218, "sampling/importance_sampling_ratio/min": 0.47234800457954407, "sampling/sampling_logp_difference/max": 0.750039279460907, "sampling/sampling_logp_difference/mean": 0.010895155370235443, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4938.0, "completions/max_terminated_length": 4938.0, "completions/mean_length": 2102.625, "completions/mean_terminated_length": 2102.625, "completions/min_length": 540.0, "completions/min_terminated_length": 540.0, "entropy": 0.26307209208607674, "epoch": 0.021410579345088162, "frac_reward_zero_std": 0.0, "grad_norm": 0.09255766776304643, "kl": 0.00022965222524362616, "learning_rate": 9.989348460541428e-07, "loss": -0.1255, "num_tokens": 2762229.0, "reward": 1.9583333730697632, "reward_std": 0.6036289930343628, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7975785732269287, "sampling/importance_sampling_ratio/mean": 1.0000602006912231, "sampling/importance_sampling_ratio/min": 0.4564056694507599, "sampling/sampling_logp_difference/max": 0.7843732833862305, "sampling/sampling_logp_difference/mean": 0.006846947129815817, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4691.0, "completions/max_terminated_length": 4691.0, "completions/mean_length": 2285.70849609375, "completions/mean_terminated_length": 2285.70849609375, "completions/min_length": 972.0, "completions/min_terminated_length": 972.0, "entropy": 0.34202149510383606, "epoch": 0.02204030226700252, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07804640445487586, "kl": 0.00025143203311017714, "learning_rate": 9.9886933787612e-07, "loss": 0.0242, "num_tokens": 2828150.0, "reward": 2.0833334922790527, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4551024436950684, "sampling/importance_sampling_ratio/mean": 0.9999803900718689, "sampling/importance_sampling_ratio/min": 0.3658035397529602, "sampling/sampling_logp_difference/max": 1.0056588649749756, "sampling/sampling_logp_difference/mean": 0.008085719309747219, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7606.0, "completions/mean_length": 4050.33349609375, "completions/mean_terminated_length": 3458.666748046875, "completions/min_length": 861.0, "completions/min_terminated_length": 861.0, "entropy": 0.4363689050078392, "epoch": 0.022670025188916875, "frac_reward_zero_std": 0.0, "grad_norm": 0.11722158600252196, "kl": 0.00029846944380551577, "learning_rate": 9.988018772233785e-07, "loss": 0.1397, "num_tokens": 2957614.0, "reward": 1.5416667461395264, "reward_std": 0.819232165813446, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.6361116170883179, "sampling/importance_sampling_ratio/mean": 1.0000721216201782, "sampling/importance_sampling_ratio/min": 0.008419197052717209, "sampling/sampling_logp_difference/max": 4.777240753173828, "sampling/sampling_logp_difference/mean": 0.011326688341796398, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6225.0, "completions/max_terminated_length": 6225.0, "completions/mean_length": 2729.666748046875, "completions/mean_terminated_length": 2729.666748046875, "completions/min_length": 943.0, "completions/min_terminated_length": 943.0, "entropy": 0.41258206218481064, "epoch": 0.023299748110831235, "frac_reward_zero_std": 0.0, "grad_norm": 0.09864029130766123, "kl": 0.0002975050010718405, "learning_rate": 9.987324643599457e-07, "loss": -0.0271, "num_tokens": 3042518.0, "reward": 2.0416667461395264, "reward_std": 0.5078567266464233, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.870772361755371, "sampling/importance_sampling_ratio/mean": 0.9998884797096252, "sampling/importance_sampling_ratio/min": 0.37831783294677734, "sampling/sampling_logp_difference/max": 0.9720206260681152, "sampling/sampling_logp_difference/mean": 0.010409670881927013, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5167.0, "completions/max_terminated_length": 5167.0, "completions/mean_length": 1854.2083740234375, "completions/mean_terminated_length": 1854.2083740234375, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "entropy": 0.4884159192442894, "epoch": 0.02392947103274559, "frac_reward_zero_std": 0.0, "grad_norm": 0.12422302943011132, "kl": 0.00040006462950259447, "learning_rate": 9.9866109955749e-07, "loss": 0.0436, "num_tokens": 3094803.0, "reward": 1.9166667461395264, "reward_std": 0.5201624631881714, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.491466999053955, "sampling/importance_sampling_ratio/mean": 1.0000016689300537, "sampling/importance_sampling_ratio/min": 0.6880128383636475, "sampling/sampling_logp_difference/max": 0.39976024627685547, "sampling/sampling_logp_difference/mean": 0.011871909722685814, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3579.0, "completions/max_terminated_length": 3579.0, "completions/mean_length": 2254.58349609375, "completions/mean_terminated_length": 2254.58349609375, "completions/min_length": 1112.0, "completions/min_terminated_length": 1112.0, "entropy": 0.32763805985450745, "epoch": 0.02455919395465995, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08080981280301183, "kl": 0.0002662780425453093, "learning_rate": 9.985877830953186e-07, "loss": -0.0236, "num_tokens": 3177537.0, "reward": 1.6666667461395264, "reward_std": 0.34503278136253357, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4685527086257935, "sampling/importance_sampling_ratio/mean": 1.000165581703186, "sampling/importance_sampling_ratio/min": 0.6056477427482605, "sampling/sampling_logp_difference/max": 0.5014567375183105, "sampling/sampling_logp_difference/mean": 0.008694658055901527, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6851.0, "completions/max_terminated_length": 6851.0, "completions/mean_length": 2564.375, "completions/mean_terminated_length": 2564.375, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.31522436439991, "epoch": 0.02518891687657431, "frac_reward_zero_std": 0.0, "grad_norm": 0.10334649652995383, "kl": 0.00034248296287842095, "learning_rate": 9.985125152603777e-07, "loss": -0.0363, "num_tokens": 3254658.0, "reward": 2.5416667461395264, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000043272972107, "sampling/importance_sampling_ratio/min": 0.615426778793335, "sampling/sampling_logp_difference/max": 0.7624187469482422, "sampling/sampling_logp_difference/mean": 0.008489835076034069, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 1647.041748046875, "completions/mean_terminated_length": 1362.478271484375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "entropy": 0.3139888495206833, "epoch": 0.025818639798488665, "frac_reward_zero_std": 0.0, "grad_norm": 0.15763768102478842, "kl": 0.00034860755840782076, "learning_rate": 9.984352963472507e-07, "loss": 0.3576, "num_tokens": 3303403.0, "reward": 2.2083334922790527, "reward_std": 0.6207113265991211, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.977181077003479, "sampling/importance_sampling_ratio/mean": 1.0001555681228638, "sampling/importance_sampling_ratio/min": 0.062018707394599915, "sampling/sampling_logp_difference/max": 2.7803192138671875, "sampling/sampling_logp_difference/mean": 0.008136974647641182, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3110.0, "completions/max_terminated_length": 3110.0, "completions/mean_length": 1715.5, "completions/mean_terminated_length": 1715.5, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "entropy": 0.3279667869210243, "epoch": 0.02644836272040302, "frac_reward_zero_std": 0.0, "grad_norm": 0.10059420589879375, "kl": 0.000425611920945812, "learning_rate": 9.983561266581564e-07, "loss": 0.0328, "num_tokens": 3366135.0, "reward": 2.2083334922790527, "reward_std": 0.5914937257766724, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4475935697555542, "sampling/importance_sampling_ratio/mean": 1.0000622272491455, "sampling/importance_sampling_ratio/min": 0.43120118975639343, "sampling/sampling_logp_difference/max": 0.8411805629730225, "sampling/sampling_logp_difference/mean": 0.008496417663991451, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2685.0, "completions/max_terminated_length": 2685.0, "completions/mean_length": 1614.625, "completions/mean_terminated_length": 1614.625, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "entropy": 0.31438443064689636, "epoch": 0.02707808564231738, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.14626258096524927, "kl": 0.0004481686555664055, "learning_rate": 9.982750065029497e-07, "loss": -0.0969, "num_tokens": 3419422.0, "reward": 2.2916667461395264, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998164176940918, "sampling/importance_sampling_ratio/min": 0.6313685178756714, "sampling/sampling_logp_difference/max": 1.4382715225219727, "sampling/sampling_logp_difference/mean": 0.008314422331750393, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5741.0, "completions/max_terminated_length": 5741.0, "completions/mean_length": 2120.125, "completions/mean_terminated_length": 2120.125, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "entropy": 0.37023602426052094, "epoch": 0.027707808564231738, "frac_reward_zero_std": 0.0, "grad_norm": 0.10788947748707473, "kl": 0.00044036174222128466, "learning_rate": 9.981919361991182e-07, "loss": 0.0044, "num_tokens": 3481609.0, "reward": 2.375, "reward_std": 0.721926212310791, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.2855629920959473, "sampling/importance_sampling_ratio/mean": 1.0000948905944824, "sampling/importance_sampling_ratio/min": 0.5472345948219299, "sampling/sampling_logp_difference/max": 0.6028776168823242, "sampling/sampling_logp_difference/mean": 0.009188516065478325, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5481.0, "completions/max_terminated_length": 5481.0, "completions/mean_length": 2823.0, "completions/mean_terminated_length": 2823.0, "completions/min_length": 1303.0, "completions/min_terminated_length": 1303.0, "entropy": 0.43843183666467667, "epoch": 0.028337531486146095, "frac_reward_zero_std": 0.0, "grad_norm": 0.15112361992545542, "kl": 0.00036839770473307, "learning_rate": 9.981069160717828e-07, "loss": -0.0218, "num_tokens": 3563777.0, "reward": 1.6666667461395264, "reward_std": 0.6218420267105103, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.1666666716337204, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997805953025818, "sampling/importance_sampling_ratio/min": 0.21948625147342682, "sampling/sampling_logp_difference/max": 1.516465663909912, "sampling/sampling_logp_difference/mean": 0.011187676340341568, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3291.0, "completions/max_terminated_length": 3291.0, "completions/mean_length": 1319.9583740234375, "completions/mean_terminated_length": 1319.9583740234375, "completions/min_length": 576.0, "completions/min_terminated_length": 576.0, "entropy": 0.318659208714962, "epoch": 0.028967254408060455, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08612289181735308, "kl": 0.0004689020352088846, "learning_rate": 9.98019946453695e-07, "loss": 0.0098, "num_tokens": 3604328.0, "reward": 2.6666667461395264, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3260879516601562, "sampling/importance_sampling_ratio/mean": 0.9999516606330872, "sampling/importance_sampling_ratio/min": 0.7395709156990051, "sampling/sampling_logp_difference/max": 0.301685094833374, "sampling/sampling_logp_difference/mean": 0.007940899580717087, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4224.0, "completions/max_terminated_length": 4224.0, "completions/mean_length": 2412.541748046875, "completions/mean_terminated_length": 2412.541748046875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "entropy": 0.5198383405804634, "epoch": 0.02959697732997481, "frac_reward_zero_std": 0.0, "grad_norm": 0.11819413127428616, "kl": 0.0005112206054036506, "learning_rate": 9.979310276852365e-07, "loss": -0.0398, "num_tokens": 3670885.0, "reward": 2.5416667461395264, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3129160404205322, "sampling/importance_sampling_ratio/mean": 0.9998650550842285, "sampling/importance_sampling_ratio/min": 0.6831293702125549, "sampling/sampling_logp_difference/max": 0.3810710906982422, "sampling/sampling_logp_difference/mean": 0.012378424406051636, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3327.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 1864.4583740234375, "completions/mean_terminated_length": 1864.4583740234375, "completions/min_length": 912.0, "completions/min_terminated_length": 912.0, "entropy": 0.36649373918771744, "epoch": 0.030226700251889168, "frac_reward_zero_std": 0.0, "grad_norm": 0.13759403700545966, "kl": 0.00040635126060806215, "learning_rate": 9.978401601144178e-07, "loss": -0.1101, "num_tokens": 3733496.0, "reward": 2.375, "reward_std": 0.6106518507003784, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.4018584489822388, "sampling/importance_sampling_ratio/mean": 0.9999827742576599, "sampling/importance_sampling_ratio/min": 0.2845064699649811, "sampling/sampling_logp_difference/max": 1.2569992542266846, "sampling/sampling_logp_difference/mean": 0.008633526973426342, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6941.0, "completions/max_terminated_length": 6941.0, "completions/mean_length": 2693.33349609375, "completions/mean_terminated_length": 2693.33349609375, "completions/min_length": 569.0, "completions/min_terminated_length": 569.0, "entropy": 0.38997676223516464, "epoch": 0.030856423173803528, "frac_reward_zero_std": 0.0, "grad_norm": 0.09236740195132286, "kl": 0.00043198742787353694, "learning_rate": 9.977473440968762e-07, "loss": 0.0862, "num_tokens": 3813920.0, "reward": 2.3333334922790527, "reward_std": 0.5815500020980835, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7460371255874634, "sampling/importance_sampling_ratio/mean": 0.9998745918273926, "sampling/importance_sampling_ratio/min": 0.5455047488212585, "sampling/sampling_logp_difference/max": 0.606043815612793, "sampling/sampling_logp_difference/mean": 0.009938262403011322, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7530.0, "completions/max_terminated_length": 7530.0, "completions/mean_length": 1773.8333740234375, "completions/mean_terminated_length": 1773.8333740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.351453572511673, "epoch": 0.031486146095717885, "frac_reward_zero_std": 0.0, "grad_norm": 0.13765493172060173, "kl": 0.0007580164965474978, "learning_rate": 9.976525799958749e-07, "loss": -0.3705, "num_tokens": 3865908.0, "reward": 2.0416667461395264, "reward_std": 0.7658529281616211, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7452871799468994, "sampling/importance_sampling_ratio/mean": 1.0000686645507812, "sampling/importance_sampling_ratio/min": 0.19733268022537231, "sampling/sampling_logp_difference/max": 1.6228642463684082, "sampling/sampling_logp_difference/mean": 0.009079648181796074, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 5721.0, "completions/mean_length": 2675.0, "completions/mean_terminated_length": 2173.45458984375, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "entropy": 0.49512071907520294, "epoch": 0.03211586901763224, "frac_reward_zero_std": 0.0, "grad_norm": 0.13436229178989167, "kl": 0.0005068464088253677, "learning_rate": 9.975558681823018e-07, "loss": 0.1336, "num_tokens": 3940100.0, "reward": 1.9166667461395264, "reward_std": 0.8569601774215698, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.504900336265564, "sampling/importance_sampling_ratio/mean": 0.9998403191566467, "sampling/importance_sampling_ratio/min": 0.48235660791397095, "sampling/sampling_logp_difference/max": 0.7290716171264648, "sampling/sampling_logp_difference/mean": 0.011550637893378735, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6525.0, "completions/mean_length": 2892.70849609375, "completions/mean_terminated_length": 2410.95458984375, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "entropy": 0.37002190947532654, "epoch": 0.0327455919395466, "frac_reward_zero_std": 0.0, "grad_norm": 0.11038757094676616, "kl": 0.00045385764678940177, "learning_rate": 9.974572090346674e-07, "loss": -0.0329, "num_tokens": 4018669.0, "reward": 1.7916667461395264, "reward_std": 0.8039863109588623, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.72950279712677, "sampling/importance_sampling_ratio/mean": 1.0001310110092163, "sampling/importance_sampling_ratio/min": 0.47296610474586487, "sampling/sampling_logp_difference/max": 0.7487316131591797, "sampling/sampling_logp_difference/mean": 0.009218277409672737, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 6468.0, "completions/mean_length": 4030.95849609375, "completions/mean_terminated_length": 3198.75, "completions/min_length": 1018.0, "completions/min_terminated_length": 1018.0, "entropy": 0.4586448147892952, "epoch": 0.033375314861460954, "frac_reward_zero_std": 0.0, "grad_norm": 0.10067552373353401, "kl": 0.0004195983929093927, "learning_rate": 9.97356602939104e-07, "loss": 0.1234, "num_tokens": 4131372.0, "reward": 1.6666667461395264, "reward_std": 0.7864987850189209, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.5756882429122925, "sampling/importance_sampling_ratio/mean": 0.9999602437019348, "sampling/importance_sampling_ratio/min": 0.5798970460891724, "sampling/sampling_logp_difference/max": 0.5449047088623047, "sampling/sampling_logp_difference/mean": 0.011181230656802654, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6814.0, "completions/max_terminated_length": 6814.0, "completions/mean_length": 2924.75, "completions/mean_terminated_length": 2924.75, "completions/min_length": 1124.0, "completions/min_terminated_length": 1124.0, "entropy": 0.4983374997973442, "epoch": 0.03400503778337532, "frac_reward_zero_std": 0.0, "grad_norm": 0.1216863152169037, "kl": 0.00047613376955268905, "learning_rate": 9.972540502893638e-07, "loss": 0.0261, "num_tokens": 4214774.0, "reward": 2.0833334922790527, "reward_std": 0.6804856061935425, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.463090419769287, "sampling/importance_sampling_ratio/mean": 1.0001989603042603, "sampling/importance_sampling_ratio/min": 0.4618614912033081, "sampling/sampling_logp_difference/max": 0.7724902629852295, "sampling/sampling_logp_difference/mean": 0.011950581334531307, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4937.0, "completions/max_terminated_length": 4937.0, "completions/mean_length": 2540.95849609375, "completions/mean_terminated_length": 2540.95849609375, "completions/min_length": 1032.0, "completions/min_terminated_length": 1032.0, "entropy": 0.29634784162044525, "epoch": 0.034634760705289674, "frac_reward_zero_std": 0.0, "grad_norm": 0.10724096763154597, "kl": 0.00041410480480408296, "learning_rate": 9.971495514868172e-07, "loss": 0.0267, "num_tokens": 4300261.0, "reward": 1.75, "reward_std": 0.6838971376419067, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.7341536283493042, "sampling/importance_sampling_ratio/mean": 1.0002702474594116, "sampling/importance_sampling_ratio/min": 0.5404850840568542, "sampling/sampling_logp_difference/max": 0.615288257598877, "sampling/sampling_logp_difference/mean": 0.007867313921451569, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3460.0, "completions/max_terminated_length": 3460.0, "completions/mean_length": 2223.875, "completions/mean_terminated_length": 2223.875, "completions/min_length": 828.0, "completions/min_terminated_length": 828.0, "entropy": 0.23776791244745255, "epoch": 0.03526448362720403, "frac_reward_zero_std": 0.0, "grad_norm": 0.09095645800444022, "kl": 0.00037343089206842706, "learning_rate": 9.970431069404516e-07, "loss": -0.087, "num_tokens": 4375434.0, "reward": 2.0, "reward_std": 0.8140539526939392, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.8077961206436157, "sampling/importance_sampling_ratio/mean": 1.000008463859558, "sampling/importance_sampling_ratio/min": 0.6188492178916931, "sampling/sampling_logp_difference/max": 0.5921084880828857, "sampling/sampling_logp_difference/mean": 0.006521804258227348, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4277.0, "completions/max_terminated_length": 4277.0, "completions/mean_length": 1589.8333740234375, "completions/mean_terminated_length": 1589.8333740234375, "completions/min_length": 713.0, "completions/min_terminated_length": 713.0, "entropy": 0.39004354178905487, "epoch": 0.03589420654911839, "frac_reward_zero_std": 0.0, "grad_norm": 0.14803262489286775, "kl": 0.0005313951915013604, "learning_rate": 9.969347170668696e-07, "loss": 0.1071, "num_tokens": 4423958.0, "reward": 2.0416667461395264, "reward_std": 0.6026668548583984, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3141255378723145, "sampling/importance_sampling_ratio/mean": 0.9998435974121094, "sampling/importance_sampling_ratio/min": 0.6812120676040649, "sampling/sampling_logp_difference/max": 0.38388168811798096, "sampling/sampling_logp_difference/mean": 0.009660131298005581, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 5941.0, "completions/mean_length": 3246.58349609375, "completions/mean_terminated_length": 2540.09521484375, "completions/min_length": 1259.0, "completions/min_terminated_length": 1259.0, "entropy": 0.42667340487241745, "epoch": 0.036523929471032744, "frac_reward_zero_std": 0.0, "grad_norm": 0.10152694236523854, "kl": 0.00046938483137637377, "learning_rate": 9.968243822902878e-07, "loss": 0.063, "num_tokens": 4513940.0, "reward": 1.8333333730697632, "reward_std": 0.7864987850189209, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.9890210628509521, "sampling/importance_sampling_ratio/mean": 1.0002037286758423, "sampling/importance_sampling_ratio/min": 0.6382823586463928, "sampling/sampling_logp_difference/max": 0.6876425743103027, "sampling/sampling_logp_difference/mean": 0.00989213865250349, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5338.0, "completions/max_terminated_length": 5338.0, "completions/mean_length": 1631.041748046875, "completions/mean_terminated_length": 1631.041748046875, "completions/min_length": 601.0, "completions/min_terminated_length": 601.0, "entropy": 0.3424348160624504, "epoch": 0.0371536523929471, "frac_reward_zero_std": 0.0, "grad_norm": 0.12781160577520104, "kl": 0.0005242961051408201, "learning_rate": 9.967121030425339e-07, "loss": 0.1497, "num_tokens": 4566989.0, "reward": 2.625, "reward_std": 0.580485463142395, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5741482973098755, "sampling/importance_sampling_ratio/mean": 0.999905526638031, "sampling/importance_sampling_ratio/min": 0.6999139189720154, "sampling/sampling_logp_difference/max": 0.45371437072753906, "sampling/sampling_logp_difference/mean": 0.00898975133895874, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6768.0, "completions/max_terminated_length": 6768.0, "completions/mean_length": 2048.875, "completions/mean_terminated_length": 2048.875, "completions/min_length": 1250.0, "completions/min_terminated_length": 1250.0, "entropy": 0.34162139892578125, "epoch": 0.037783375314861464, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09987824101631107, "kl": 0.0005094236548757181, "learning_rate": 9.965978797630468e-07, "loss": 0.1006, "num_tokens": 4625882.0, "reward": 2.375, "reward_std": 0.4244926869869232, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4302642345428467, "sampling/importance_sampling_ratio/mean": 1.0002514123916626, "sampling/importance_sampling_ratio/min": 0.4340468645095825, "sampling/sampling_logp_difference/max": 0.8346028327941895, "sampling/sampling_logp_difference/mean": 0.009162995032966137, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5157.0, "completions/max_terminated_length": 5157.0, "completions/mean_length": 2156.916748046875, "completions/mean_terminated_length": 2156.916748046875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "entropy": 0.4054840952157974, "epoch": 0.03841309823677582, "frac_reward_zero_std": 0.0, "grad_norm": 0.10507590479698285, "kl": 0.0005396807755460031, "learning_rate": 9.964817128988733e-07, "loss": 0.0143, "num_tokens": 4693248.0, "reward": 2.0416667461395264, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5840328931808472, "sampling/importance_sampling_ratio/mean": 1.0000195503234863, "sampling/importance_sampling_ratio/min": 0.4244614839553833, "sampling/sampling_logp_difference/max": 0.8569340705871582, "sampling/sampling_logp_difference/mean": 0.00990035105496645, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5066.0, "completions/max_terminated_length": 5066.0, "completions/mean_length": 2393.791748046875, "completions/mean_terminated_length": 2393.791748046875, "completions/min_length": 1123.0, "completions/min_terminated_length": 1123.0, "entropy": 0.3531186506152153, "epoch": 0.03904282115869018, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09418837524207836, "kl": 0.00046933674457250163, "learning_rate": 9.963636029046674e-07, "loss": -0.1465, "num_tokens": 4764059.0, "reward": 1.8333333730697632, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5958549976348877, "sampling/importance_sampling_ratio/mean": 0.9998967051506042, "sampling/importance_sampling_ratio/min": 0.3997570872306824, "sampling/sampling_logp_difference/max": 0.916898250579834, "sampling/sampling_logp_difference/mean": 0.008721103891730309, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4325.0, "completions/max_terminated_length": 4325.0, "completions/mean_length": 2510.25, "completions/mean_terminated_length": 2510.25, "completions/min_length": 751.0, "completions/min_terminated_length": 751.0, "entropy": 0.4050501585006714, "epoch": 0.039672544080604534, "frac_reward_zero_std": 0.0, "grad_norm": 0.10757247975284855, "kl": 0.000530352619534824, "learning_rate": 9.96243550242688e-07, "loss": -0.0274, "num_tokens": 4859561.0, "reward": 2.1666667461395264, "reward_std": 0.7317181825637817, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6597645282745361, "sampling/importance_sampling_ratio/mean": 0.9999773502349854, "sampling/importance_sampling_ratio/min": 0.7137321829795837, "sampling/sampling_logp_difference/max": 0.5066757202148438, "sampling/sampling_logp_difference/mean": 0.009516916237771511, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2932.0, "completions/max_terminated_length": 2932.0, "completions/mean_length": 1563.25, "completions/mean_terminated_length": 1563.25, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "entropy": 0.27534880861639977, "epoch": 0.04030226700251889, "frac_reward_zero_std": 0.0, "grad_norm": 0.11432734216609192, "kl": 0.00044913776218891144, "learning_rate": 9.961215553827969e-07, "loss": 0.0193, "num_tokens": 4906239.0, "reward": 2.4166667461395264, "reward_std": 0.6410841941833496, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.4805097579956055, "sampling/importance_sampling_ratio/mean": 0.9999955296516418, "sampling/importance_sampling_ratio/min": 0.4797734320163727, "sampling/sampling_logp_difference/max": 0.7344412803649902, "sampling/sampling_logp_difference/mean": 0.006728786043822765, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3757.0, "completions/max_terminated_length": 3757.0, "completions/mean_length": 1718.791748046875, "completions/mean_terminated_length": 1718.791748046875, "completions/min_length": 954.0, "completions/min_terminated_length": 954.0, "entropy": 0.3746664524078369, "epoch": 0.04093198992443325, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09207506931814506, "kl": 0.0005126752585056238, "learning_rate": 9.95997618802458e-07, "loss": -0.0312, "num_tokens": 4956698.0, "reward": 2.5833334922790527, "reward_std": 0.41387641429901123, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4070683717727661, "sampling/importance_sampling_ratio/mean": 1.000154972076416, "sampling/importance_sampling_ratio/min": 0.6983901858329773, "sampling/sampling_logp_difference/max": 0.3589773178100586, "sampling/sampling_logp_difference/mean": 0.009151982143521309, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4635.0, "completions/max_terminated_length": 4635.0, "completions/mean_length": 2060.166748046875, "completions/mean_terminated_length": 2060.166748046875, "completions/min_length": 822.0, "completions/min_terminated_length": 822.0, "entropy": 0.21687553822994232, "epoch": 0.041561712846347604, "frac_reward_zero_std": 0.0, "grad_norm": 0.09335391521704145, "kl": 0.00048785527906147763, "learning_rate": 9.95871740986734e-07, "loss": 0.081, "num_tokens": 5023750.0, "reward": 2.5, "reward_std": 0.5671766996383667, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000704526901245, "sampling/importance_sampling_ratio/min": 0.5396162271499634, "sampling/sampling_logp_difference/max": 0.8718996047973633, "sampling/sampling_logp_difference/mean": 0.005851762369275093, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5989.0, "completions/max_terminated_length": 5989.0, "completions/mean_length": 3391.166748046875, "completions/mean_terminated_length": 3391.166748046875, "completions/min_length": 1537.0, "completions/min_terminated_length": 1537.0, "entropy": 0.5156021639704704, "epoch": 0.04219143576826197, "frac_reward_zero_std": 0.0, "grad_norm": 0.10235734640148458, "kl": 0.000580199935939163, "learning_rate": 9.957439224282854e-07, "loss": -0.0898, "num_tokens": 5121370.0, "reward": 1.9166667461395264, "reward_std": 0.6467158794403076, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.5061092376708984, "sampling/importance_sampling_ratio/mean": 1.000134825706482, "sampling/importance_sampling_ratio/min": 0.6068882942199707, "sampling/sampling_logp_difference/max": 0.4994105100631714, "sampling/sampling_logp_difference/mean": 0.01208382099866867, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7872.0, "completions/mean_length": 3056.666748046875, "completions/mean_terminated_length": 2833.391357421875, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "entropy": 0.4604324847459793, "epoch": 0.042821158690176324, "frac_reward_zero_std": 0.0, "grad_norm": 0.11337309879224916, "kl": 0.0005867292638868093, "learning_rate": 9.956141636273688e-07, "loss": 0.0331, "num_tokens": 5204650.0, "reward": 2.0416667461395264, "reward_std": 0.7396931052207947, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000182032585144, "sampling/importance_sampling_ratio/min": 0.5678019523620605, "sampling/sampling_logp_difference/max": 0.8366403579711914, "sampling/sampling_logp_difference/mean": 0.01156303845345974, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7004.0, "completions/max_terminated_length": 7004.0, "completions/mean_length": 3018.791748046875, "completions/mean_terminated_length": 3018.791748046875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "entropy": 0.32892755419015884, "epoch": 0.04345088161209068, "frac_reward_zero_std": 0.0, "grad_norm": 0.09388914181401084, "kl": 0.0005324802405084483, "learning_rate": 9.954824650918337e-07, "loss": -0.0687, "num_tokens": 5297397.0, "reward": 1.9583333730697632, "reward_std": 0.5566146373748779, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.669889211654663, "sampling/importance_sampling_ratio/mean": 0.9998946785926819, "sampling/importance_sampling_ratio/min": 0.2800692617893219, "sampling/sampling_logp_difference/max": 1.2727183103561401, "sampling/sampling_logp_difference/mean": 0.008645528927445412, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6901.0, "completions/mean_length": 3480.95849609375, "completions/mean_terminated_length": 3052.681884765625, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "entropy": 0.3960796892642975, "epoch": 0.04408060453400504, "frac_reward_zero_std": 0.0, "grad_norm": 0.09789598047815735, "kl": 0.000535642982868012, "learning_rate": 9.95348827337122e-07, "loss": 0.0862, "num_tokens": 5394300.0, "reward": 1.9583333730697632, "reward_std": 0.6439208984375, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.6934921741485596, "sampling/importance_sampling_ratio/mean": 1.0000340938568115, "sampling/importance_sampling_ratio/min": 0.49482038617134094, "sampling/sampling_logp_difference/max": 0.703560471534729, "sampling/sampling_logp_difference/mean": 0.009929805994033813, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4453.0, "completions/mean_length": 2653.166748046875, "completions/mean_terminated_length": 2412.347900390625, "completions/min_length": 915.0, "completions/min_terminated_length": 915.0, "entropy": 0.3814065009355545, "epoch": 0.044710327455919394, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1148527091244411, "kl": 0.0005678123270627111, "learning_rate": 9.952132508862652e-07, "loss": 0.127, "num_tokens": 5468112.0, "reward": 2.2083334922790527, "reward_std": 0.5867810249328613, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.649259090423584, "sampling/importance_sampling_ratio/mean": 0.9999160766601562, "sampling/importance_sampling_ratio/min": 0.5244871973991394, "sampling/sampling_logp_difference/max": 0.6453342437744141, "sampling/sampling_logp_difference/mean": 0.009795396588742733, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4073.0, "completions/max_terminated_length": 4073.0, "completions/mean_length": 2206.0, "completions/mean_terminated_length": 2206.0, "completions/min_length": 778.0, "completions/min_terminated_length": 778.0, "entropy": 0.5162052288651466, "epoch": 0.04534005037783375, "frac_reward_zero_std": 0.0, "grad_norm": 0.12607385183495892, "kl": 0.0006568302487721667, "learning_rate": 9.950757362698826e-07, "loss": -0.0033, "num_tokens": 5531336.0, "reward": 2.625, "reward_std": 0.5480016469955444, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3272823095321655, "sampling/importance_sampling_ratio/mean": 0.9996255040168762, "sampling/importance_sampling_ratio/min": 0.6717764139175415, "sampling/sampling_logp_difference/max": 0.3978297710418701, "sampling/sampling_logp_difference/mean": 0.011835867539048195, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3957.0, "completions/max_terminated_length": 3957.0, "completions/mean_length": 1960.5, "completions/mean_terminated_length": 1960.5, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "entropy": 0.29671207070350647, "epoch": 0.045969773299748114, "frac_reward_zero_std": 0.0, "grad_norm": 0.10917499510372347, "kl": 0.0006367346359184012, "learning_rate": 9.949362840261783e-07, "loss": -0.0404, "num_tokens": 5596132.0, "reward": 2.2916667461395264, "reward_std": 0.6592972278594971, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5159894227981567, "sampling/importance_sampling_ratio/mean": 1.0000884532928467, "sampling/importance_sampling_ratio/min": 0.6527342796325684, "sampling/sampling_logp_difference/max": 0.42658519744873047, "sampling/sampling_logp_difference/mean": 0.007639448158442974, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4980.0, "completions/max_terminated_length": 4980.0, "completions/mean_length": 2328.541748046875, "completions/mean_terminated_length": 2328.541748046875, "completions/min_length": 1091.0, "completions/min_terminated_length": 1091.0, "entropy": 0.44034674763679504, "epoch": 0.04659949622166247, "frac_reward_zero_std": 0.0, "grad_norm": 0.10908056186652516, "kl": 0.0005699600878870115, "learning_rate": 9.94794894700941e-07, "loss": -0.0961, "num_tokens": 5662265.0, "reward": 2.375, "reward_std": 0.6026668548583984, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4186866283416748, "sampling/importance_sampling_ratio/mean": 1.000102162361145, "sampling/importance_sampling_ratio/min": 0.6987900733947754, "sampling/sampling_logp_difference/max": 0.35840487480163574, "sampling/sampling_logp_difference/mean": 0.00992572121322155, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3864.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 2150.291748046875, "completions/mean_terminated_length": 2150.291748046875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.46057599782943726, "epoch": 0.04722921914357683, "frac_reward_zero_std": 0.0, "grad_norm": 0.11404778453906618, "kl": 0.0005691817495971918, "learning_rate": 9.9465156884754e-07, "loss": 0.0883, "num_tokens": 5725040.0, "reward": 1.75, "reward_std": 0.6969282627105713, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.487659215927124, "sampling/importance_sampling_ratio/mean": 1.00015389919281, "sampling/importance_sampling_ratio/min": 0.5369391441345215, "sampling/sampling_logp_difference/max": 0.6218705177307129, "sampling/sampling_logp_difference/mean": 0.009705870412290096, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1934.0, "completions/max_terminated_length": 1934.0, "completions/mean_length": 1316.666748046875, "completions/mean_terminated_length": 1316.666748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.24472882971167564, "epoch": 0.04785894206549118, "frac_reward_zero_std": 0.0, "grad_norm": 0.10041139721905812, "kl": 0.0038392629357986152, "learning_rate": 9.945063070269237e-07, "loss": -0.0972, "num_tokens": 5765896.0, "reward": 2.5, "reward_std": 0.6803731322288513, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5297679901123047, "sampling/importance_sampling_ratio/mean": 0.9999176859855652, "sampling/importance_sampling_ratio/min": 0.5922755002975464, "sampling/sampling_logp_difference/max": 0.5237834453582764, "sampling/sampling_logp_difference/mean": 0.00639535766094923, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6975.0, "completions/max_terminated_length": 6975.0, "completions/mean_length": 2060.45849609375, "completions/mean_terminated_length": 2060.45849609375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "entropy": 0.3591771349310875, "epoch": 0.04848866498740554, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09853054266787299, "kl": 0.0006394037045538425, "learning_rate": 9.943591098076183e-07, "loss": 0.0599, "num_tokens": 5824275.0, "reward": 2.1666667461395264, "reward_std": 0.4497717618942261, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3478755950927734, "sampling/importance_sampling_ratio/mean": 1.0000452995300293, "sampling/importance_sampling_ratio/min": 0.6271273493766785, "sampling/sampling_logp_difference/max": 0.46660566329956055, "sampling/sampling_logp_difference/mean": 0.009488197974860668, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8019.0, "completions/mean_length": 3225.416748046875, "completions/mean_terminated_length": 3009.478271484375, "completions/min_length": 1024.0, "completions/min_terminated_length": 1024.0, "entropy": 0.3118775859475136, "epoch": 0.0491183879093199, "frac_reward_zero_std": 0.0, "grad_norm": 0.10511279160035333, "kl": 0.0005353014494176023, "learning_rate": 9.94209977765724e-07, "loss": 0.0642, "num_tokens": 5936557.0, "reward": 1.75, "reward_std": 0.6159346103668213, "rewards/cloze_reward/mean": 0.25, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.767826795578003, "sampling/importance_sampling_ratio/mean": 0.9999815821647644, "sampling/importance_sampling_ratio/min": 0.5565706491470337, "sampling/sampling_logp_difference/max": 0.585961103439331, "sampling/sampling_logp_difference/mean": 0.008131339214742184, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5760.0, "completions/max_terminated_length": 5760.0, "completions/mean_length": 1640.25, "completions/mean_terminated_length": 1640.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.35863303393125534, "epoch": 0.04974811083123426, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09868547185612409, "kl": 0.006926886941073462, "learning_rate": 9.94058911484914e-07, "loss": -0.0868, "num_tokens": 5985515.0, "reward": 1.5416667461395264, "reward_std": 0.48464712500572205, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.4541382789611816, "sampling/importance_sampling_ratio/mean": 1.000003457069397, "sampling/importance_sampling_ratio/min": 0.6579961180686951, "sampling/sampling_logp_difference/max": 0.41855621337890625, "sampling/sampling_logp_difference/mean": 0.009228984825313091, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5833.0, "completions/max_terminated_length": 5833.0, "completions/mean_length": 1878.5833740234375, "completions/mean_terminated_length": 1878.5833740234375, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.4619139954447746, "epoch": 0.05037783375314862, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11451161262202007, "kl": 0.0007760575244901702, "learning_rate": 9.939059115564308e-07, "loss": 0.0704, "num_tokens": 6038513.0, "reward": 1.7916667461395264, "reward_std": 0.4082186222076416, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.0416666679084301, "rewards/code_reward/std": 0.20412413775920868, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.3850724697113037, "sampling/importance_sampling_ratio/mean": 1.000027060508728, "sampling/importance_sampling_ratio/min": 6.968940624574316e-08, "sampling/sampling_logp_difference/max": 16.479217529296875, "sampling/sampling_logp_difference/mean": 0.011271677911281586, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5797.0, "completions/max_terminated_length": 5797.0, "completions/mean_length": 1936.3333740234375, "completions/mean_terminated_length": 1936.3333740234375, "completions/min_length": 758.0, "completions/min_terminated_length": 758.0, "entropy": 0.2565155550837517, "epoch": 0.05100755667506297, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09577126168903811, "kl": 0.0006315442005870864, "learning_rate": 9.937509785790863e-07, "loss": -0.0724, "num_tokens": 6109025.0, "reward": 1.8333333730697632, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5804327726364136, "sampling/importance_sampling_ratio/mean": 0.9997892379760742, "sampling/importance_sampling_ratio/min": 0.4897601306438446, "sampling/sampling_logp_difference/max": 0.7138395309448242, "sampling/sampling_logp_difference/mean": 0.006899207830429077, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2489.0, "completions/max_terminated_length": 2489.0, "completions/mean_length": 1547.416748046875, "completions/mean_terminated_length": 1547.416748046875, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.3061675876379013, "epoch": 0.05163727959697733, "frac_reward_zero_std": 0.0, "grad_norm": 0.11418829615552499, "kl": 0.0007893179717939347, "learning_rate": 9.935941131592568e-07, "loss": 0.0196, "num_tokens": 6158443.0, "reward": 2.0833334922790527, "reward_std": 0.7617236971855164, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9025863409042358, "sampling/importance_sampling_ratio/mean": 0.9998946189880371, "sampling/importance_sampling_ratio/min": 0.6942810416221619, "sampling/sampling_logp_difference/max": 0.643214225769043, "sampling/sampling_logp_difference/mean": 0.007694135420024395, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5666.0, "completions/max_terminated_length": 5666.0, "completions/mean_length": 2628.416748046875, "completions/mean_terminated_length": 2628.416748046875, "completions/min_length": 1259.0, "completions/min_terminated_length": 1259.0, "entropy": 0.34959229826927185, "epoch": 0.052267002518891686, "frac_reward_zero_std": 0.0, "grad_norm": 0.1083029696716491, "kl": 0.0007318278803722933, "learning_rate": 9.93435315910882e-07, "loss": 0.0003, "num_tokens": 6240237.0, "reward": 1.7916667461395264, "reward_std": 0.6380135416984558, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4637458324432373, "sampling/importance_sampling_ratio/mean": 1.0000331401824951, "sampling/importance_sampling_ratio/min": 0.30795514583587646, "sampling/sampling_logp_difference/max": 1.1778011322021484, "sampling/sampling_logp_difference/mean": 0.008969496004283428, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4946.0, "completions/max_terminated_length": 4946.0, "completions/mean_length": 2006.3333740234375, "completions/mean_terminated_length": 2006.3333740234375, "completions/min_length": 1062.0, "completions/min_terminated_length": 1062.0, "entropy": 0.4032848849892616, "epoch": 0.05289672544080604, "frac_reward_zero_std": 0.0, "grad_norm": 0.13998117576253338, "kl": 0.00075358989124652, "learning_rate": 9.932745874554629e-07, "loss": -0.1645, "num_tokens": 6301797.0, "reward": 2.2083334922790527, "reward_std": 0.6199029684066772, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3031971454620361, "sampling/importance_sampling_ratio/mean": 0.9999451637268066, "sampling/importance_sampling_ratio/min": 0.6144919395446777, "sampling/sampling_logp_difference/max": 0.48695945739746094, "sampling/sampling_logp_difference/mean": 0.009588050656020641, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5250.0, "completions/max_terminated_length": 5250.0, "completions/mean_length": 1982.666748046875, "completions/mean_terminated_length": 1982.666748046875, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "entropy": 0.28628499433398247, "epoch": 0.0535264483627204, "frac_reward_zero_std": 0.0, "grad_norm": 0.11495387315431888, "kl": 0.000775712585891597, "learning_rate": 9.931119284220582e-07, "loss": 0.0814, "num_tokens": 6367453.0, "reward": 2.5416667461395264, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6285258531570435, "sampling/importance_sampling_ratio/mean": 1.0000699758529663, "sampling/importance_sampling_ratio/min": 0.613396167755127, "sampling/sampling_logp_difference/max": 0.48874425888061523, "sampling/sampling_logp_difference/mean": 0.00758761540055275, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7697.0, "completions/max_terminated_length": 7697.0, "completions/mean_length": 3685.58349609375, "completions/mean_terminated_length": 3685.58349609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3706379160284996, "epoch": 0.05415617128463476, "frac_reward_zero_std": 0.0, "grad_norm": 0.08109774233186637, "kl": 0.01561154866067227, "learning_rate": 9.929473394472829e-07, "loss": -0.2836, "num_tokens": 6475451.0, "reward": 2.125, "reward_std": 0.8563064336776733, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.597758173942566, "sampling/importance_sampling_ratio/mean": 1.000095248222351, "sampling/importance_sampling_ratio/min": 0.39956116676330566, "sampling/sampling_logp_difference/max": 0.9173884391784668, "sampling/sampling_logp_difference/mean": 0.010181929916143417, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3055.0, "completions/max_terminated_length": 3055.0, "completions/mean_length": 1898.291748046875, "completions/mean_terminated_length": 1898.291748046875, "completions/min_length": 693.0, "completions/min_terminated_length": 693.0, "entropy": 0.27456779032945633, "epoch": 0.05478589420654912, "frac_reward_zero_std": 0.0, "grad_norm": 0.10020173587006005, "kl": 0.0008661302417749539, "learning_rate": 9.927808211753052e-07, "loss": -0.0562, "num_tokens": 6539810.0, "reward": 2.375, "reward_std": 0.7481457591056824, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6179449558258057, "sampling/importance_sampling_ratio/mean": 1.000178337097168, "sampling/importance_sampling_ratio/min": 0.7406175136566162, "sampling/sampling_logp_difference/max": 0.4811568260192871, "sampling/sampling_logp_difference/mean": 0.007388257887214422, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5040.0, "completions/max_terminated_length": 5040.0, "completions/mean_length": 1734.875, "completions/mean_terminated_length": 1734.875, "completions/min_length": 644.0, "completions/min_terminated_length": 644.0, "entropy": 0.3091536574065685, "epoch": 0.055415617128463476, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10248990888817851, "kl": 0.000851440490805544, "learning_rate": 9.926123742578445e-07, "loss": -0.087, "num_tokens": 6592559.0, "reward": 2.5833334922790527, "reward_std": 0.4629100561141968, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4509674310684204, "sampling/importance_sampling_ratio/mean": 1.000031590461731, "sampling/importance_sampling_ratio/min": 0.6575571894645691, "sampling/sampling_logp_difference/max": 0.4192235469818115, "sampling/sampling_logp_difference/mean": 0.008407294750213623, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7716.0, "completions/max_terminated_length": 7716.0, "completions/mean_length": 3015.791748046875, "completions/mean_terminated_length": 3015.791748046875, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "entropy": 0.5310604050755501, "epoch": 0.05604534005037783, "frac_reward_zero_std": 0.0, "grad_norm": 0.08764031023064216, "kl": 0.0008435246854787692, "learning_rate": 9.924419993541682e-07, "loss": -0.1493, "num_tokens": 6677666.0, "reward": 1.9583333730697632, "reward_std": 0.7440237998962402, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434515476227, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000193119049072, "sampling/importance_sampling_ratio/min": 0.5156979560852051, "sampling/sampling_logp_difference/max": 0.7608542442321777, "sampling/sampling_logp_difference/mean": 0.009924042969942093, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3595.0, "completions/max_terminated_length": 3595.0, "completions/mean_length": 1963.791748046875, "completions/mean_terminated_length": 1963.791748046875, "completions/min_length": 1162.0, "completions/min_terminated_length": 1162.0, "entropy": 0.2744132913649082, "epoch": 0.05667506297229219, "frac_reward_zero_std": 0.0, "grad_norm": 0.092085218181057, "kl": 0.0008409910224145278, "learning_rate": 9.922696971310895e-07, "loss": 0.0154, "num_tokens": 6746621.0, "reward": 2.625, "reward_std": 0.46288391947746277, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7968239784240723, "sampling/importance_sampling_ratio/mean": 1.0001109838485718, "sampling/importance_sampling_ratio/min": 0.6489030122756958, "sampling/sampling_logp_difference/max": 0.5860207080841064, "sampling/sampling_logp_difference/mean": 0.0075532360933721066, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4807.0, "completions/max_terminated_length": 4807.0, "completions/mean_length": 2336.58349609375, "completions/mean_terminated_length": 2336.58349609375, "completions/min_length": 1162.0, "completions/min_terminated_length": 1162.0, "entropy": 0.2991136871278286, "epoch": 0.057304785894206546, "frac_reward_zero_std": 0.0, "grad_norm": 0.08036351091918288, "kl": 0.0008391261944780126, "learning_rate": 9.92095468262965e-07, "loss": -0.1218, "num_tokens": 6819307.0, "reward": 2.375, "reward_std": 0.8767603635787964, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.399290919303894, "sampling/importance_sampling_ratio/mean": 1.0000656843185425, "sampling/importance_sampling_ratio/min": 0.7198427319526672, "sampling/sampling_logp_difference/max": 0.335965633392334, "sampling/sampling_logp_difference/mean": 0.007454372011125088, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7753.0, "completions/mean_length": 4304.7919921875, "completions/mean_terminated_length": 4135.78271484375, "completions/min_length": 1752.0, "completions/min_terminated_length": 1752.0, "entropy": 0.419280044734478, "epoch": 0.05793450881612091, "frac_reward_zero_std": 0.0, "grad_norm": 0.102539950726785, "kl": 0.0009857426775852218, "learning_rate": 9.919193134316912e-07, "loss": -0.0204, "num_tokens": 6941518.0, "reward": 2.0833334922790527, "reward_std": 0.6713559031486511, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999597668647766, "sampling/importance_sampling_ratio/min": 0.01566619612276554, "sampling/sampling_logp_difference/max": 4.15625, "sampling/sampling_logp_difference/mean": 0.010207060724496841, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7658.0, "completions/mean_length": 4018.25, "completions/mean_terminated_length": 3836.78271484375, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "entropy": 0.4919777661561966, "epoch": 0.058564231738035266, "frac_reward_zero_std": 0.0, "grad_norm": 0.10950976567144045, "kl": 0.0008027180738281459, "learning_rate": 9.917412333267035e-07, "loss": 0.0007, "num_tokens": 7051604.0, "reward": 1.7916667461395264, "reward_std": 0.42645785212516785, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999785423278809, "sampling/importance_sampling_ratio/min": 0.4706597626209259, "sampling/sampling_logp_difference/max": 0.933851957321167, "sampling/sampling_logp_difference/mean": 0.011662858538329601, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7132.0, "completions/max_terminated_length": 7132.0, "completions/mean_length": 3465.95849609375, "completions/mean_terminated_length": 3465.95849609375, "completions/min_length": 1583.0, "completions/min_terminated_length": 1583.0, "entropy": 0.23647870868444443, "epoch": 0.05919395465994962, "frac_reward_zero_std": 0.0, "grad_norm": 0.06709842946340261, "kl": 0.0006540090689668432, "learning_rate": 9.91561228644971e-07, "loss": 0.0042, "num_tokens": 7176371.0, "reward": 2.0833334922790527, "reward_std": 0.71599280834198, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999713897705078, "sampling/importance_sampling_ratio/min": 0.12677732110023499, "sampling/sampling_logp_difference/max": 2.0653231143951416, "sampling/sampling_logp_difference/mean": 0.006247752346098423, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6247.0, "completions/max_terminated_length": 6247.0, "completions/mean_length": 2519.95849609375, "completions/mean_terminated_length": 2519.95849609375, "completions/min_length": 672.0, "completions/min_terminated_length": 672.0, "entropy": 0.43047668784856796, "epoch": 0.05982367758186398, "frac_reward_zero_std": 0.0, "grad_norm": 0.12600004374087748, "kl": 0.0009529005910735577, "learning_rate": 9.913793000909966e-07, "loss": 0.0965, "num_tokens": 7251802.0, "reward": 1.875, "reward_std": 0.6831681132316589, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.8755401372909546, "sampling/importance_sampling_ratio/mean": 1.0000299215316772, "sampling/importance_sampling_ratio/min": 0.4712376594543457, "sampling/sampling_logp_difference/max": 0.7523927688598633, "sampling/sampling_logp_difference/mean": 0.010272510349750519, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2156.0, "completions/max_terminated_length": 2156.0, "completions/mean_length": 1079.666748046875, "completions/mean_terminated_length": 1079.666748046875, "completions/min_length": 381.0, "completions/min_terminated_length": 381.0, "entropy": 0.28945203125476837, "epoch": 0.060453400503778336, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.08947266576064326, "kl": 0.001337665438768454, "learning_rate": 9.911954483768119e-07, "loss": -0.0958, "num_tokens": 7290930.0, "reward": 2.7083334922790527, "reward_std": 0.21362332999706268, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2855576276779175, "sampling/importance_sampling_ratio/mean": 0.9999513626098633, "sampling/importance_sampling_ratio/min": 0.6259210705757141, "sampling/sampling_logp_difference/max": 0.4685310423374176, "sampling/sampling_logp_difference/mean": 0.007411744445562363, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2792.0, "completions/max_terminated_length": 2792.0, "completions/mean_length": 1634.25, "completions/mean_terminated_length": 1634.25, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "entropy": 0.34051821380853653, "epoch": 0.06108312342569269, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09860706062752657, "kl": 0.0011072328052250668, "learning_rate": 9.91009674221976e-07, "loss": 0.0283, "num_tokens": 7339056.0, "reward": 2.8333334922790527, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.272341012954712, "sampling/importance_sampling_ratio/mean": 1.000235915184021, "sampling/importance_sampling_ratio/min": 0.7249646186828613, "sampling/sampling_logp_difference/max": 0.32163238525390625, "sampling/sampling_logp_difference/mean": 0.008430426940321922, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7412.0, "completions/max_terminated_length": 7412.0, "completions/mean_length": 2451.291748046875, "completions/mean_terminated_length": 2451.291748046875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.37811753153800964, "epoch": 0.061712846347607056, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0791503177522628, "kl": 0.001003706463961862, "learning_rate": 9.908219783535715e-07, "loss": -0.1273, "num_tokens": 7408607.0, "reward": 2.0833334922790527, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000287294387817, "sampling/importance_sampling_ratio/min": 0.6561478972434998, "sampling/sampling_logp_difference/max": 0.7293694019317627, "sampling/sampling_logp_difference/mean": 0.009823106229305267, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4354.0, "completions/max_terminated_length": 4354.0, "completions/mean_length": 2396.625, "completions/mean_terminated_length": 2396.625, "completions/min_length": 1295.0, "completions/min_terminated_length": 1295.0, "entropy": 0.27734624594449997, "epoch": 0.06234256926952141, "frac_reward_zero_std": 0.0, "grad_norm": 0.08485802628207598, "kl": 0.0008417748467763886, "learning_rate": 9.906323615062024e-07, "loss": 0.037, "num_tokens": 7495238.0, "reward": 2.0416667461395264, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6092437505722046, "sampling/importance_sampling_ratio/mean": 1.0000120401382446, "sampling/importance_sampling_ratio/min": 0.6201003789901733, "sampling/sampling_logp_difference/max": 0.4778739809989929, "sampling/sampling_logp_difference/mean": 0.007460249122232199, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4175.0, "completions/max_terminated_length": 4175.0, "completions/mean_length": 1627.041748046875, "completions/mean_terminated_length": 1627.041748046875, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "entropy": 0.2800791338086128, "epoch": 0.06297229219143577, "frac_reward_zero_std": 0.0, "grad_norm": 0.11346533887138611, "kl": 0.0011977115063928068, "learning_rate": 9.904408244219915e-07, "loss": 0.0036, "num_tokens": 7544311.0, "reward": 2.2916667461395264, "reward_std": 0.6753765344619751, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.512062430381775, "sampling/importance_sampling_ratio/mean": 0.9997647404670715, "sampling/importance_sampling_ratio/min": 0.7005547881126404, "sampling/sampling_logp_difference/max": 0.41347455978393555, "sampling/sampling_logp_difference/mean": 0.007427469827234745, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7008.0, "completions/mean_length": 3439.375, "completions/mean_terminated_length": 3232.7392578125, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.4785473793745041, "epoch": 0.06360201511335013, "frac_reward_zero_std": 0.0, "grad_norm": 0.0981913486277418, "kl": 0.0011427186254877597, "learning_rate": 9.902473678505761e-07, "loss": 0.0812, "num_tokens": 7646496.0, "reward": 1.5833333730697632, "reward_std": 0.6803731322288513, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.7981657981872559, "sampling/importance_sampling_ratio/mean": 0.9999257922172546, "sampling/importance_sampling_ratio/min": 0.5317656397819519, "sampling/sampling_logp_difference/max": 0.6315524578094482, "sampling/sampling_logp_difference/mean": 0.01101614348590374, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7404.0, "completions/max_terminated_length": 7404.0, "completions/mean_length": 3337.83349609375, "completions/mean_terminated_length": 3337.83349609375, "completions/min_length": 1780.0, "completions/min_terminated_length": 1780.0, "entropy": 0.43085478246212006, "epoch": 0.06423173803526448, "frac_reward_zero_std": 0.0, "grad_norm": 0.09185678199317533, "kl": 0.0009418722620466724, "learning_rate": 9.900519925491067e-07, "loss": 0.0693, "num_tokens": 7739996.0, "reward": 1.625, "reward_std": 0.5566146373748779, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.9121699333190918, "sampling/importance_sampling_ratio/mean": 0.9999627470970154, "sampling/importance_sampling_ratio/min": 0.5346202850341797, "sampling/sampling_logp_difference/max": 0.6482386589050293, "sampling/sampling_logp_difference/mean": 0.011042862199246883, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4675.0, "completions/max_terminated_length": 4675.0, "completions/mean_length": 1760.041748046875, "completions/mean_terminated_length": 1760.041748046875, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "entropy": 0.4049137681722641, "epoch": 0.06486146095717885, "frac_reward_zero_std": 0.0, "grad_norm": 0.1061563237067904, "kl": 0.0013132845924701542, "learning_rate": 9.898546992822432e-07, "loss": -0.1821, "num_tokens": 7789605.0, "reward": 2.1666667461395264, "reward_std": 0.5423438549041748, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3857558965682983, "sampling/importance_sampling_ratio/mean": 1.0001368522644043, "sampling/importance_sampling_ratio/min": 0.7216381430625916, "sampling/sampling_logp_difference/max": 0.3262457847595215, "sampling/sampling_logp_difference/mean": 0.01048242300748825, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 5163.0, "completions/mean_length": 2939.541748046875, "completions/mean_terminated_length": 2462.04541015625, "completions/min_length": 1155.0, "completions/min_terminated_length": 1155.0, "entropy": 0.3533059135079384, "epoch": 0.0654911838790932, "frac_reward_zero_std": 0.0, "grad_norm": 0.09744245915748133, "kl": 0.0010975931072607636, "learning_rate": 9.89655488822152e-07, "loss": 0.0939, "num_tokens": 7873874.0, "reward": 1.75, "reward_std": 0.7702247500419617, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.6877269744873047, "sampling/importance_sampling_ratio/mean": 1.0001873970031738, "sampling/importance_sampling_ratio/min": 0.6687234044075012, "sampling/sampling_logp_difference/max": 0.5233826637268066, "sampling/sampling_logp_difference/mean": 0.009434574283659458, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3535.0, "completions/max_terminated_length": 3535.0, "completions/mean_length": 1975.625, "completions/mean_terminated_length": 1975.625, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "entropy": 0.3001897633075714, "epoch": 0.06612090680100756, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.08301182096968973, "kl": 0.0010883735958486795, "learning_rate": 9.894543619485026e-07, "loss": -0.0181, "num_tokens": 7933297.0, "reward": 2.2083334922790527, "reward_std": 0.17251639068126678, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000734329223633, "sampling/importance_sampling_ratio/min": 0.551615297794342, "sampling/sampling_logp_difference/max": 0.7915081977844238, "sampling/sampling_logp_difference/mean": 0.007933909073472023, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6085.0, "completions/max_terminated_length": 6085.0, "completions/mean_length": 2577.875, "completions/mean_terminated_length": 2577.875, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "entropy": 0.40688369423151016, "epoch": 0.06675062972292191, "frac_reward_zero_std": 0.0, "grad_norm": 0.1200100894015942, "kl": 0.0011067414015997201, "learning_rate": 9.892513194484657e-07, "loss": -0.1475, "num_tokens": 8009222.0, "reward": 2.375, "reward_std": 0.47419947385787964, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6464428901672363, "sampling/importance_sampling_ratio/mean": 0.9999796748161316, "sampling/importance_sampling_ratio/min": 0.03693580999970436, "sampling/sampling_logp_difference/max": 3.2985737323760986, "sampling/sampling_logp_difference/mean": 0.010434087365865707, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5775.0, "completions/max_terminated_length": 5775.0, "completions/mean_length": 2882.58349609375, "completions/mean_terminated_length": 2882.58349609375, "completions/min_length": 1129.0, "completions/min_terminated_length": 1129.0, "entropy": 0.29056142643094063, "epoch": 0.06738035264483627, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08263460398354364, "kl": 0.0010080863867187873, "learning_rate": 9.890463621167089e-07, "loss": -0.0189, "num_tokens": 8096668.0, "reward": 2.2916667461395264, "reward_std": 0.3698274493217468, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4579102993011475, "sampling/importance_sampling_ratio/mean": 0.9997231960296631, "sampling/importance_sampling_ratio/min": 0.6214349269866943, "sampling/sampling_logp_difference/max": 0.47572410106658936, "sampling/sampling_logp_difference/mean": 0.007787648588418961, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5058.0, "completions/max_terminated_length": 5058.0, "completions/mean_length": 2319.70849609375, "completions/mean_terminated_length": 2319.70849609375, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "entropy": 0.4266025722026825, "epoch": 0.06801007556675064, "frac_reward_zero_std": 0.0, "grad_norm": 0.09070090985031054, "kl": 0.0011410817533032969, "learning_rate": 9.88839490755394e-07, "loss": 0.0662, "num_tokens": 8163045.0, "reward": 2.0, "reward_std": 0.5260697603225708, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.4688998460769653, "sampling/importance_sampling_ratio/mean": 0.999826967716217, "sampling/importance_sampling_ratio/min": 0.6561976075172424, "sampling/sampling_logp_difference/max": 0.4212932586669922, "sampling/sampling_logp_difference/mean": 0.009725000709295273, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5948.0, "completions/max_terminated_length": 5948.0, "completions/mean_length": 2547.166748046875, "completions/mean_terminated_length": 2547.166748046875, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.3749017044901848, "epoch": 0.06863979848866499, "frac_reward_zero_std": 0.0, "grad_norm": 0.10615625998697513, "kl": 0.0010724562598625198, "learning_rate": 9.886307061741743e-07, "loss": -0.0255, "num_tokens": 8240417.0, "reward": 1.5416667461395264, "reward_std": 0.622288167476654, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9972753524780273, "sampling/importance_sampling_ratio/mean": 0.9998630881309509, "sampling/importance_sampling_ratio/min": 0.5021137595176697, "sampling/sampling_logp_difference/max": 0.6917839050292969, "sampling/sampling_logp_difference/mean": 0.009753728285431862, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6161.0, "completions/max_terminated_length": 6161.0, "completions/mean_length": 3655.58349609375, "completions/mean_terminated_length": 3655.58349609375, "completions/min_length": 1278.0, "completions/min_terminated_length": 1278.0, "entropy": 0.44808000326156616, "epoch": 0.06926952141057935, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0628880538598155, "kl": 0.0010511923901503906, "learning_rate": 9.884200091901906e-07, "loss": 0.003, "num_tokens": 8341311.0, "reward": 2.0, "reward_std": 0.5605830550193787, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6500744819641113, "sampling/importance_sampling_ratio/mean": 0.9999232292175293, "sampling/importance_sampling_ratio/min": 0.4714786112308502, "sampling/sampling_logp_difference/max": 0.7518815994262695, "sampling/sampling_logp_difference/mean": 0.011372799053788185, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4518.0, "completions/max_terminated_length": 4518.0, "completions/mean_length": 2462.875, "completions/mean_terminated_length": 2462.875, "completions/min_length": 1311.0, "completions/min_terminated_length": 1311.0, "entropy": 0.3477909490466118, "epoch": 0.0698992443324937, "frac_reward_zero_std": 0.0, "grad_norm": 0.10619083725443619, "kl": 0.0011545615561772138, "learning_rate": 9.88207400628069e-07, "loss": -0.0404, "num_tokens": 8428236.0, "reward": 2.0, "reward_std": 0.5345224738121033, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000321865081787, "sampling/importance_sampling_ratio/min": 0.45139262080192566, "sampling/sampling_logp_difference/max": 2.101231098175049, "sampling/sampling_logp_difference/mean": 0.00963824987411499, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6876.0, "completions/max_terminated_length": 6876.0, "completions/mean_length": 3245.70849609375, "completions/mean_terminated_length": 3245.70849609375, "completions/min_length": 1224.0, "completions/min_terminated_length": 1224.0, "entropy": 0.4173242747783661, "epoch": 0.07052896725440806, "frac_reward_zero_std": 0.0, "grad_norm": 0.11448063301179875, "kl": 0.0011658581788651645, "learning_rate": 9.879928813199167e-07, "loss": -0.1017, "num_tokens": 8518069.0, "reward": 2.25, "reward_std": 0.5643138885498047, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7942841053009033, "sampling/importance_sampling_ratio/mean": 0.9997835159301758, "sampling/importance_sampling_ratio/min": 0.5656030178070068, "sampling/sampling_logp_difference/max": 0.5846061706542969, "sampling/sampling_logp_difference/mean": 0.010739123448729515, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4916.0, "completions/max_terminated_length": 4916.0, "completions/mean_length": 2146.125, "completions/mean_terminated_length": 2146.125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.35864806175231934, "epoch": 0.07115869017632241, "frac_reward_zero_std": 0.0, "grad_norm": 0.09566873499635238, "kl": 0.04087439985596575, "learning_rate": 9.877764521053197e-07, "loss": -0.1846, "num_tokens": 8582080.0, "reward": 1.625, "reward_std": 0.7447940707206726, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5461739301681519, "sampling/importance_sampling_ratio/mean": 1.0000237226486206, "sampling/importance_sampling_ratio/min": 0.5496307611465454, "sampling/sampling_logp_difference/max": 0.5985085964202881, "sampling/sampling_logp_difference/mean": 0.008771594613790512, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7951.0, "completions/mean_length": 4066.25, "completions/mean_terminated_length": 3886.86962890625, "completions/min_length": 1115.0, "completions/min_terminated_length": 1115.0, "entropy": 0.5439133569598198, "epoch": 0.07178841309823678, "frac_reward_zero_std": 0.0, "grad_norm": 0.10237649872181782, "kl": 0.0011129225604236126, "learning_rate": 9.875581138313377e-07, "loss": -0.0111, "num_tokens": 8696982.0, "reward": 1.2916667461395264, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.791853904724121, "sampling/importance_sampling_ratio/mean": 1.0000473260879517, "sampling/importance_sampling_ratio/min": 0.5067123770713806, "sampling/sampling_logp_difference/max": 0.6798117160797119, "sampling/sampling_logp_difference/mean": 0.014084621332585812, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7498.0, "completions/max_terminated_length": 7498.0, "completions/mean_length": 2906.33349609375, "completions/mean_terminated_length": 2906.33349609375, "completions/min_length": 1082.0, "completions/min_terminated_length": 1082.0, "entropy": 0.35846927762031555, "epoch": 0.07241813602015114, "frac_reward_zero_std": 0.0, "grad_norm": 0.15822256729441198, "kl": 0.0012383708672132343, "learning_rate": 9.873378673525039e-07, "loss": -0.0772, "num_tokens": 8777566.0, "reward": 2.5416667461395264, "reward_std": 0.6621600985527039, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999881386756897, "sampling/importance_sampling_ratio/min": 0.5076982975006104, "sampling/sampling_logp_difference/max": 0.9778890609741211, "sampling/sampling_logp_difference/mean": 0.008894691243767738, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7087.0, "completions/max_terminated_length": 7087.0, "completions/mean_length": 3079.5, "completions/mean_terminated_length": 3079.5, "completions/min_length": 1840.0, "completions/min_terminated_length": 1840.0, "entropy": 0.3420257046818733, "epoch": 0.07304785894206549, "frac_reward_zero_std": 0.0, "grad_norm": 0.09744453407696713, "kl": 0.0010796345595736057, "learning_rate": 9.871157135308186e-07, "loss": -0.0039, "num_tokens": 8869370.0, "reward": 2.125, "reward_std": 0.5461008548736572, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5652151107788086, "sampling/importance_sampling_ratio/mean": 1.0000215768814087, "sampling/importance_sampling_ratio/min": 0.6204928159713745, "sampling/sampling_logp_difference/max": 0.47724127769470215, "sampling/sampling_logp_difference/mean": 0.009149638935923576, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4428.0, "completions/max_terminated_length": 4428.0, "completions/mean_length": 2257.041748046875, "completions/mean_terminated_length": 2257.041748046875, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "entropy": 0.2789967395365238, "epoch": 0.07367758186397985, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08073096722551931, "kl": 0.0013537549239117652, "learning_rate": 9.868916532357474e-07, "loss": -0.0734, "num_tokens": 8943619.0, "reward": 2.375, "reward_std": 0.4493255615234375, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000184774398804, "sampling/importance_sampling_ratio/min": 0.5983157753944397, "sampling/sampling_logp_difference/max": 1.36342191696167, "sampling/sampling_logp_difference/mean": 0.007219412829726934, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5092.0, "completions/max_terminated_length": 5092.0, "completions/mean_length": 2541.95849609375, "completions/mean_terminated_length": 2541.95849609375, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "entropy": 0.5761178731918335, "epoch": 0.0743073047858942, "frac_reward_zero_std": 0.0, "grad_norm": 0.1189499050113298, "kl": 0.0012044287868775427, "learning_rate": 9.866656873442174e-07, "loss": -0.0076, "num_tokens": 9024898.0, "reward": 1.8333333730697632, "reward_std": 0.6863929629325867, "rewards/cloze_reward/mean": 0.25, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.4941486120224, "sampling/importance_sampling_ratio/mean": 1.0000125169754028, "sampling/importance_sampling_ratio/min": 0.6437978148460388, "sampling/sampling_logp_difference/max": 0.4403705596923828, "sampling/sampling_logp_difference/mean": 0.013288920745253563, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5188.0, "completions/max_terminated_length": 5188.0, "completions/mean_length": 2137.375, "completions/mean_terminated_length": 2137.375, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "entropy": 0.38874951750040054, "epoch": 0.07493702770780857, "frac_reward_zero_std": 0.0, "grad_norm": 0.13589117409792464, "kl": 0.0013932335714343935, "learning_rate": 9.864378167406138e-07, "loss": 0.141, "num_tokens": 9087731.0, "reward": 2.125, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7293827533721924, "sampling/importance_sampling_ratio/mean": 0.9998942017555237, "sampling/importance_sampling_ratio/min": 0.6046415567398071, "sampling/sampling_logp_difference/max": 0.5477645397186279, "sampling/sampling_logp_difference/mean": 0.009839019738137722, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6859.0, "completions/mean_length": 4046.375, "completions/mean_terminated_length": 3669.5, "completions/min_length": 1453.0, "completions/min_terminated_length": 1453.0, "entropy": 0.3780215159058571, "epoch": 0.07556675062972293, "frac_reward_zero_std": 0.0, "grad_norm": 0.08353802586774083, "kl": 0.0009922295866999775, "learning_rate": 9.862080423167767e-07, "loss": 0.0962, "num_tokens": 9208028.0, "reward": 1.625, "reward_std": 0.6765334010124207, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.8316923379898071, "sampling/importance_sampling_ratio/mean": 0.9999591708183289, "sampling/importance_sampling_ratio/min": 0.19610591232776642, "sampling/sampling_logp_difference/max": 1.6291003227233887, "sampling/sampling_logp_difference/mean": 0.010079204104840755, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2892.0, "completions/max_terminated_length": 2892.0, "completions/mean_length": 1568.416748046875, "completions/mean_terminated_length": 1568.416748046875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "entropy": 0.2604988105595112, "epoch": 0.07619647355163728, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08119784708734208, "kl": 0.0012582674680743366, "learning_rate": 9.85976364971997e-07, "loss": 0.0035, "num_tokens": 9264734.0, "reward": 2.75, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999530911445618, "sampling/importance_sampling_ratio/min": 0.6507281064987183, "sampling/sampling_logp_difference/max": 0.8274250030517578, "sampling/sampling_logp_difference/mean": 0.0067470623180270195, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 8151.0, "completions/mean_length": 4234.95849609375, "completions/mean_terminated_length": 3875.227294921875, "completions/min_length": 1258.0, "completions/min_terminated_length": 1258.0, "entropy": 0.5585973709821701, "epoch": 0.07682619647355164, "frac_reward_zero_std": 0.0, "grad_norm": 0.10096038000633056, "kl": 0.0011543860309757292, "learning_rate": 9.857427856130137e-07, "loss": 0.0005, "num_tokens": 9375293.0, "reward": 1.875, "reward_std": 0.6617234945297241, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998733997344971, "sampling/importance_sampling_ratio/min": 0.011736102402210236, "sampling/sampling_logp_difference/max": 4.445085525512695, "sampling/sampling_logp_difference/mean": 0.013580630533397198, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2515.0, "completions/max_terminated_length": 2515.0, "completions/mean_length": 1556.0, "completions/mean_terminated_length": 1556.0, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "entropy": 0.28654061257839203, "epoch": 0.07745591939546599, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11286075976680283, "kl": 0.0016760084545239806, "learning_rate": 9.855073051540093e-07, "loss": -0.0382, "num_tokens": 9421541.0, "reward": 2.625, "reward_std": 0.367926687002182, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2936087846755981, "sampling/importance_sampling_ratio/mean": 1.0000059604644775, "sampling/importance_sampling_ratio/min": 0.518846333026886, "sampling/sampling_logp_difference/max": 0.6561474800109863, "sampling/sampling_logp_difference/mean": 0.0073918914422392845, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5961.0, "completions/max_terminated_length": 5961.0, "completions/mean_length": 3282.25, "completions/mean_terminated_length": 3282.25, "completions/min_length": 1266.0, "completions/min_terminated_length": 1266.0, "entropy": 0.2662745229899883, "epoch": 0.07808564231738035, "frac_reward_zero_std": 0.0, "grad_norm": 0.07104912061147738, "kl": 0.0010209067695541307, "learning_rate": 9.852699245166076e-07, "loss": -0.1111, "num_tokens": 9535939.0, "reward": 2.0833334922790527, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999554753303528, "sampling/importance_sampling_ratio/min": 0.35440823435783386, "sampling/sampling_logp_difference/max": 1.0373058319091797, "sampling/sampling_logp_difference/mean": 0.007719948887825012, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8066.0, "completions/mean_length": 4141.0, "completions/mean_terminated_length": 3964.86962890625, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "entropy": 0.479095883667469, "epoch": 0.0787153652392947, "frac_reward_zero_std": 0.0, "grad_norm": 0.1017205062929908, "kl": 0.0011434210755396634, "learning_rate": 9.850306446298686e-07, "loss": -0.0097, "num_tokens": 9648939.0, "reward": 1.7083333730697632, "reward_std": 0.7954527139663696, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.6932826042175293, "sampling/importance_sampling_ratio/mean": 1.0001250505447388, "sampling/importance_sampling_ratio/min": 0.5297911167144775, "sampling/sampling_logp_difference/max": 0.6352725028991699, "sampling/sampling_logp_difference/mean": 0.011144552379846573, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4433.0, "completions/max_terminated_length": 4433.0, "completions/mean_length": 2907.916748046875, "completions/mean_terminated_length": 2907.916748046875, "completions/min_length": 1518.0, "completions/min_terminated_length": 1518.0, "entropy": 0.3226708434522152, "epoch": 0.07934508816120907, "frac_reward_zero_std": 0.0, "grad_norm": 0.1213513235481053, "kl": 0.0012391995987854898, "learning_rate": 9.84789466430286e-07, "loss": 0.0231, "num_tokens": 9744097.0, "reward": 2.0833334922790527, "reward_std": 0.6744657754898071, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6716291904449463, "sampling/importance_sampling_ratio/mean": 0.9998860955238342, "sampling/importance_sampling_ratio/min": 0.11209584772586823, "sampling/sampling_logp_difference/max": 2.188400983810425, "sampling/sampling_logp_difference/mean": 0.008849279955029488, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2716.0, "completions/max_terminated_length": 2716.0, "completions/mean_length": 1627.375, "completions/mean_terminated_length": 1627.375, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "entropy": 0.2788871079683304, "epoch": 0.07997481108312343, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.089352075498641, "kl": 0.0016842354089021683, "learning_rate": 9.845463908617827e-07, "loss": -0.0908, "num_tokens": 9799018.0, "reward": 2.3333334922790527, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3148146867752075, "sampling/importance_sampling_ratio/mean": 0.999974250793457, "sampling/importance_sampling_ratio/min": 0.6864936947822571, "sampling/sampling_logp_difference/max": 0.3761582374572754, "sampling/sampling_logp_difference/mean": 0.007439345121383667, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3988.0, "completions/max_terminated_length": 3988.0, "completions/mean_length": 1822.916748046875, "completions/mean_terminated_length": 1822.916748046875, "completions/min_length": 705.0, "completions/min_terminated_length": 705.0, "entropy": 0.33324164152145386, "epoch": 0.08060453400503778, "frac_reward_zero_std": 0.0, "grad_norm": 0.11594465670359363, "kl": 0.001312935637542978, "learning_rate": 9.843014188757083e-07, "loss": -0.051, "num_tokens": 9852144.0, "reward": 2.375, "reward_std": 0.9016474485397339, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.9012305736541748, "sampling/importance_sampling_ratio/mean": 0.999911367893219, "sampling/importance_sampling_ratio/min": 0.6478620767593384, "sampling/sampling_logp_difference/max": 0.6425013542175293, "sampling/sampling_logp_difference/mean": 0.008069670759141445, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3101.0, "completions/max_terminated_length": 3101.0, "completions/mean_length": 1795.541748046875, "completions/mean_terminated_length": 1795.541748046875, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "entropy": 0.3187796473503113, "epoch": 0.08123425692695214, "frac_reward_zero_std": 0.0, "grad_norm": 0.10702735649248474, "kl": 0.001545560808153823, "learning_rate": 9.840545514308338e-07, "loss": -0.0203, "num_tokens": 9910901.0, "reward": 2.4583334922790527, "reward_std": 0.42645785212516785, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.878493309020996, "sampling/importance_sampling_ratio/mean": 0.9998769760131836, "sampling/importance_sampling_ratio/min": 0.6758046746253967, "sampling/sampling_logp_difference/max": 0.6304700374603271, "sampling/sampling_logp_difference/mean": 0.007947081699967384, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6645.0, "completions/max_terminated_length": 6645.0, "completions/mean_length": 2888.875, "completions/mean_terminated_length": 2888.875, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "entropy": 0.3566534295678139, "epoch": 0.0818639798488665, "frac_reward_zero_std": 0.0, "grad_norm": 0.09132665124965983, "kl": 0.001410718949045986, "learning_rate": 9.83805789493349e-07, "loss": -0.1813, "num_tokens": 10010170.0, "reward": 1.875, "reward_std": 0.6781584620475769, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.646325945854187, "sampling/importance_sampling_ratio/mean": 0.9998917579650879, "sampling/importance_sampling_ratio/min": 0.23434805870056152, "sampling/sampling_logp_difference/max": 1.450947880744934, "sampling/sampling_logp_difference/mean": 0.009303173050284386, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4029.0, "completions/max_terminated_length": 4029.0, "completions/mean_length": 1791.4583740234375, "completions/mean_terminated_length": 1791.4583740234375, "completions/min_length": 780.0, "completions/min_terminated_length": 780.0, "entropy": 0.3106400817632675, "epoch": 0.08249370277078086, "frac_reward_zero_std": 0.0, "grad_norm": 0.10164241606713474, "kl": 0.0015289550356101245, "learning_rate": 9.835551340368585e-07, "loss": -0.147, "num_tokens": 10063877.0, "reward": 2.5, "reward_std": 0.5986984968185425, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3925225734710693, "sampling/importance_sampling_ratio/mean": 0.9999276995658875, "sampling/importance_sampling_ratio/min": 0.6305667161941528, "sampling/sampling_logp_difference/max": 0.4611363410949707, "sampling/sampling_logp_difference/mean": 0.007937481626868248, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4552.0, "completions/max_terminated_length": 4552.0, "completions/mean_length": 2282.625, "completions/mean_terminated_length": 2282.625, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "entropy": 0.34564197063446045, "epoch": 0.08312342569269521, "frac_reward_zero_std": 0.0, "grad_norm": 0.10198306055553523, "kl": 0.0013500663335435092, "learning_rate": 9.83302586042377e-07, "loss": 0.0304, "num_tokens": 10138860.0, "reward": 2.0416667461395264, "reward_std": 0.6026668548583984, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5354584455490112, "sampling/importance_sampling_ratio/mean": 0.9997383952140808, "sampling/importance_sampling_ratio/min": 0.5805074572563171, "sampling/sampling_logp_difference/max": 0.543852686882019, "sampling/sampling_logp_difference/mean": 0.009094756096601486, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3582.0, "completions/max_terminated_length": 3582.0, "completions/mean_length": 1951.4583740234375, "completions/mean_terminated_length": 1951.4583740234375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "entropy": 0.44415752589702606, "epoch": 0.08375314861460957, "frac_reward_zero_std": 0.0, "grad_norm": 0.10825958891107168, "kl": 0.0016319558781106025, "learning_rate": 9.830481464983276e-07, "loss": -0.0389, "num_tokens": 10197591.0, "reward": 2.5833334922790527, "reward_std": 0.5201624631881714, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7623876333236694, "sampling/importance_sampling_ratio/mean": 1.0001215934753418, "sampling/importance_sampling_ratio/min": 0.5972827672958374, "sampling/sampling_logp_difference/max": 0.5666694641113281, "sampling/sampling_logp_difference/mean": 0.010422902181744576, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5377.0, "completions/max_terminated_length": 5377.0, "completions/mean_length": 2495.666748046875, "completions/mean_terminated_length": 2495.666748046875, "completions/min_length": 503.0, "completions/min_terminated_length": 503.0, "entropy": 0.39490970969200134, "epoch": 0.08438287153652393, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0853964709688724, "kl": 0.001383873721351847, "learning_rate": 9.827918164005353e-07, "loss": 0.0396, "num_tokens": 10277055.0, "reward": 2.5833334922790527, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6117613315582275, "sampling/importance_sampling_ratio/mean": 0.9999027252197266, "sampling/importance_sampling_ratio/min": 0.6046438813209534, "sampling/sampling_logp_difference/max": 0.5031156539916992, "sampling/sampling_logp_difference/mean": 0.009498472325503826, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8133.0, "completions/mean_length": 4066.33349609375, "completions/mean_terminated_length": 3886.95654296875, "completions/min_length": 2001.0, "completions/min_terminated_length": 2001.0, "entropy": 0.31912386417388916, "epoch": 0.08501259445843828, "frac_reward_zero_std": 0.0, "grad_norm": 0.07983749203834462, "kl": 0.001106872339732945, "learning_rate": 9.825335967522248e-07, "loss": -0.0198, "num_tokens": 10418343.0, "reward": 1.9583333730697632, "reward_std": 0.695380687713623, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999863505363464, "sampling/importance_sampling_ratio/min": 0.11522023379802704, "sampling/sampling_logp_difference/max": 2.16090989112854, "sampling/sampling_logp_difference/mean": 0.008759896270930767, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6871.0, "completions/mean_length": 3360.166748046875, "completions/mean_terminated_length": 3150.0869140625, "completions/min_length": 1235.0, "completions/min_terminated_length": 1235.0, "entropy": 0.4073001593351364, "epoch": 0.08564231738035265, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09754489941156037, "kl": 0.0013227509043645114, "learning_rate": 9.822734885640162e-07, "loss": -0.0968, "num_tokens": 10514843.0, "reward": 2.2916667461395264, "reward_std": 0.7519629001617432, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998976588249207, "sampling/importance_sampling_ratio/min": 0.668824315071106, "sampling/sampling_logp_difference/max": 0.9340195655822754, "sampling/sampling_logp_difference/mean": 0.009728405624628067, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4119.0, "completions/max_terminated_length": 4119.0, "completions/mean_length": 2094.791748046875, "completions/mean_terminated_length": 2094.791748046875, "completions/min_length": 466.0, "completions/min_terminated_length": 466.0, "entropy": 0.4706733301281929, "epoch": 0.086272040302267, "frac_reward_zero_std": 0.0, "grad_norm": 0.11129184497849672, "kl": 0.0016373182297684252, "learning_rate": 9.820114928539207e-07, "loss": -0.0279, "num_tokens": 10582214.0, "reward": 1.8333333730697632, "reward_std": 0.6816081404685974, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.871256947517395, "sampling/importance_sampling_ratio/mean": 1.0001564025878906, "sampling/importance_sampling_ratio/min": 0.10608359426259995, "sampling/sampling_logp_difference/max": 2.243527889251709, "sampling/sampling_logp_difference/mean": 0.011260932311415672, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6187.0, "completions/mean_length": 3734.5, "completions/mean_terminated_length": 3329.272705078125, "completions/min_length": 1484.0, "completions/min_terminated_length": 1484.0, "entropy": 0.45528871566057205, "epoch": 0.08690176322418136, "frac_reward_zero_std": 0.0, "grad_norm": 0.11080072136223708, "kl": 0.0011396599584259093, "learning_rate": 9.817476106473373e-07, "loss": 0.1766, "num_tokens": 10686026.0, "reward": 1.75, "reward_std": 0.7570996284484863, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.4551427364349365, "sampling/importance_sampling_ratio/mean": 0.9999509453773499, "sampling/importance_sampling_ratio/min": 0.1890290230512619, "sampling/sampling_logp_difference/max": 1.6658546924591064, "sampling/sampling_logp_difference/mean": 0.011057078838348389, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4422.0, "completions/max_terminated_length": 4422.0, "completions/mean_length": 1915.875, "completions/mean_terminated_length": 1915.875, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.24479403719305992, "epoch": 0.08753148614609572, "frac_reward_zero_std": 0.0, "grad_norm": 0.09651628632754326, "kl": 0.0015484678151551634, "learning_rate": 9.814818429770479e-07, "loss": -0.0932, "num_tokens": 10758263.0, "reward": 2.4583334922790527, "reward_std": 0.5863928198814392, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6166008710861206, "sampling/importance_sampling_ratio/mean": 1.0001004934310913, "sampling/importance_sampling_ratio/min": 0.5219735503196716, "sampling/sampling_logp_difference/max": 0.6501383781433105, "sampling/sampling_logp_difference/mean": 0.006356148049235344, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5776.0, "completions/mean_length": 3330.20849609375, "completions/mean_terminated_length": 3118.826171875, "completions/min_length": 975.0, "completions/min_terminated_length": 975.0, "entropy": 0.4264685809612274, "epoch": 0.08816120906801007, "frac_reward_zero_std": 0.0, "grad_norm": 0.5466752009549422, "kl": 0.0012587751261889935, "learning_rate": 9.812141908832142e-07, "loss": 0.0643, "num_tokens": 10850148.0, "reward": 1.9583333730697632, "reward_std": 0.5957427024841309, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6860579252243042, "sampling/importance_sampling_ratio/mean": 1.0000256299972534, "sampling/importance_sampling_ratio/min": 0.39407122135162354, "sampling/sampling_logp_difference/max": 0.9312236309051514, "sampling/sampling_logp_difference/mean": 0.010446175932884216, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6187.0, "completions/max_terminated_length": 6187.0, "completions/mean_length": 2146.25, "completions/mean_terminated_length": 2146.25, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "entropy": 0.45592450350522995, "epoch": 0.08879093198992444, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0896147506068088, "kl": 0.0014971649507060647, "learning_rate": 9.809446554133727e-07, "loss": -0.063, "num_tokens": 10911618.0, "reward": 2.2916667461395264, "reward_std": 0.21362332999706268, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5590275526046753, "sampling/importance_sampling_ratio/mean": 0.9999138712882996, "sampling/importance_sampling_ratio/min": 0.5589776635169983, "sampling/sampling_logp_difference/max": 0.5816457271575928, "sampling/sampling_logp_difference/mean": 0.010521872900426388, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4171.0, "completions/max_terminated_length": 4171.0, "completions/mean_length": 2255.791748046875, "completions/mean_terminated_length": 2255.791748046875, "completions/min_length": 1037.0, "completions/min_terminated_length": 1037.0, "entropy": 0.33298663794994354, "epoch": 0.08942065491183879, "frac_reward_zero_std": 0.0, "grad_norm": 0.09022941576462887, "kl": 0.0015007451293058693, "learning_rate": 9.806732376224314e-07, "loss": 0.0212, "num_tokens": 10977365.0, "reward": 2.5416667461395264, "reward_std": 0.5138766169548035, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5263761281967163, "sampling/importance_sampling_ratio/mean": 0.9999563097953796, "sampling/importance_sampling_ratio/min": 0.5090041160583496, "sampling/sampling_logp_difference/max": 0.6752991676330566, "sampling/sampling_logp_difference/mean": 0.008181631565093994, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3773.0, "completions/max_terminated_length": 3773.0, "completions/mean_length": 2130.791748046875, "completions/mean_terminated_length": 2130.791748046875, "completions/min_length": 472.0, "completions/min_terminated_length": 472.0, "entropy": 0.41070906817913055, "epoch": 0.09005037783375315, "frac_reward_zero_std": 0.0, "grad_norm": 0.11170094488730563, "kl": 0.001561674871481955, "learning_rate": 9.803999385726652e-07, "loss": -0.0841, "num_tokens": 11040656.0, "reward": 2.5, "reward_std": 0.6803731322288513, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000017762184143, "sampling/importance_sampling_ratio/min": 0.49484193325042725, "sampling/sampling_logp_difference/max": 0.8965229988098145, "sampling/sampling_logp_difference/mean": 0.010071562603116035, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6966.0, "completions/max_terminated_length": 6966.0, "completions/mean_length": 2301.666748046875, "completions/mean_terminated_length": 2301.666748046875, "completions/min_length": 967.0, "completions/min_terminated_length": 967.0, "entropy": 0.29975274577736855, "epoch": 0.0906801007556675, "frac_reward_zero_std": 0.0, "grad_norm": 0.12387964169550274, "kl": 0.0014343600196298212, "learning_rate": 9.801247593337118e-07, "loss": -0.1763, "num_tokens": 11107712.0, "reward": 2.0, "reward_std": 0.6288648843765259, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4708359241485596, "sampling/importance_sampling_ratio/mean": 1.0001784563064575, "sampling/importance_sampling_ratio/min": 0.6899948716163635, "sampling/sampling_logp_difference/max": 0.3858308792114258, "sampling/sampling_logp_difference/mean": 0.007844698615372181, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6601.0, "completions/max_terminated_length": 6601.0, "completions/mean_length": 3599.375, "completions/mean_terminated_length": 3599.375, "completions/min_length": 1467.0, "completions/min_terminated_length": 1467.0, "entropy": 0.456609271466732, "epoch": 0.09130982367758186, "frac_reward_zero_std": 0.0, "grad_norm": 0.10721397795880636, "kl": 0.0012475104595068842, "learning_rate": 9.798477009825677e-07, "loss": 0.0045, "num_tokens": 11212537.0, "reward": 1.6666667461395264, "reward_std": 0.6015613079071045, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001850128173828, "sampling/importance_sampling_ratio/min": 0.39424505829811096, "sampling/sampling_logp_difference/max": 0.9988222122192383, "sampling/sampling_logp_difference/mean": 0.011334974318742752, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6718.0, "completions/max_terminated_length": 6718.0, "completions/mean_length": 2898.875, "completions/mean_terminated_length": 2898.875, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "entropy": 0.3217344731092453, "epoch": 0.09193954659949623, "frac_reward_zero_std": 0.0, "grad_norm": 0.09035470512340633, "kl": 0.0013234440702944994, "learning_rate": 9.795687646035837e-07, "loss": 0.0801, "num_tokens": 11312982.0, "reward": 2.125, "reward_std": 0.7288650274276733, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5247482061386108, "sampling/importance_sampling_ratio/mean": 0.9999173283576965, "sampling/importance_sampling_ratio/min": 0.4186219274997711, "sampling/sampling_logp_difference/max": 0.8707871437072754, "sampling/sampling_logp_difference/mean": 0.008322684094309807, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4168.0, "completions/max_terminated_length": 4168.0, "completions/mean_length": 2184.83349609375, "completions/mean_terminated_length": 2184.83349609375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "entropy": 0.27261799573898315, "epoch": 0.09256926952141058, "frac_reward_zero_std": 0.0, "grad_norm": 0.09953771403343403, "kl": 0.001751499919919297, "learning_rate": 9.79287951288461e-07, "loss": -0.0832, "num_tokens": 11389850.0, "reward": 2.0416667461395264, "reward_std": 0.6439208984375, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.9150339365005493, "sampling/importance_sampling_ratio/mean": 0.9999136924743652, "sampling/importance_sampling_ratio/min": 0.47057512402534485, "sampling/sampling_logp_difference/max": 0.7537996768951416, "sampling/sampling_logp_difference/mean": 0.007268694695085287, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3823.0, "completions/max_terminated_length": 3823.0, "completions/mean_length": 1857.875, "completions/mean_terminated_length": 1857.875, "completions/min_length": 1043.0, "completions/min_terminated_length": 1043.0, "entropy": 0.2828839421272278, "epoch": 0.09319899244332494, "frac_reward_zero_std": 0.0, "grad_norm": 0.1766122448916352, "kl": 0.001605220721103251, "learning_rate": 9.790052621362468e-07, "loss": -0.0228, "num_tokens": 11446023.0, "reward": 2.5416667461395264, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7388403415679932, "sampling/importance_sampling_ratio/mean": 1.0001403093338013, "sampling/importance_sampling_ratio/min": 0.5473614931106567, "sampling/sampling_logp_difference/max": 0.6026458740234375, "sampling/sampling_logp_difference/mean": 0.00738956592977047, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6514.0, "completions/max_terminated_length": 6514.0, "completions/mean_length": 2132.041748046875, "completions/mean_terminated_length": 2132.041748046875, "completions/min_length": 983.0, "completions/min_terminated_length": 983.0, "entropy": 0.34400393068790436, "epoch": 0.09382871536523929, "frac_reward_zero_std": 0.0, "grad_norm": 0.09299739661480184, "kl": 0.0018359389796387404, "learning_rate": 9.7872069825333e-07, "loss": -0.0521, "num_tokens": 11511040.0, "reward": 2.7083334922790527, "reward_std": 0.6055297255516052, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6616686582565308, "sampling/importance_sampling_ratio/mean": 0.9999704360961914, "sampling/importance_sampling_ratio/min": 0.6341878771781921, "sampling/sampling_logp_difference/max": 0.5078222751617432, "sampling/sampling_logp_difference/mean": 0.008521191775798798, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7340.0, "completions/max_terminated_length": 7340.0, "completions/mean_length": 3228.5, "completions/mean_terminated_length": 3228.5, "completions/min_length": 1112.0, "completions/min_terminated_length": 1112.0, "entropy": 0.4372124597430229, "epoch": 0.09445843828715365, "frac_reward_zero_std": 0.0, "grad_norm": 0.0916678386916059, "kl": 0.0016735615499783307, "learning_rate": 9.784342607534362e-07, "loss": -0.0909, "num_tokens": 11603740.0, "reward": 1.8333333730697632, "reward_std": 0.5970090627670288, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9498347043991089, "sampling/importance_sampling_ratio/mean": 0.99993896484375, "sampling/importance_sampling_ratio/min": 0.3570214807987213, "sampling/sampling_logp_difference/max": 1.0299594402313232, "sampling/sampling_logp_difference/mean": 0.011452614329755306, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6969.0, "completions/mean_length": 3600.375, "completions/mean_terminated_length": 3400.7392578125, "completions/min_length": 787.0, "completions/min_terminated_length": 787.0, "entropy": 0.45368898659944534, "epoch": 0.095088161209068, "frac_reward_zero_std": 0.0, "grad_norm": 0.11460536056205627, "kl": 0.001458976388676092, "learning_rate": 9.781459507576248e-07, "loss": 0.0995, "num_tokens": 11700213.0, "reward": 2.5, "reward_std": 0.7463539838790894, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7510156631469727, "sampling/importance_sampling_ratio/mean": 0.9998540282249451, "sampling/importance_sampling_ratio/min": 0.4891478419303894, "sampling/sampling_logp_difference/max": 0.7150905132293701, "sampling/sampling_logp_difference/mean": 0.011685380712151527, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5339.0, "completions/mean_length": 2366.375, "completions/mean_terminated_length": 2113.0869140625, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "entropy": 0.4097738191485405, "epoch": 0.09571788413098237, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07780117552841895, "kl": 0.0015736535715404898, "learning_rate": 9.778557693942832e-07, "loss": 0.022, "num_tokens": 11779918.0, "reward": 1.9583333730697632, "reward_std": 0.5410773754119873, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.5248353481292725, "sampling/importance_sampling_ratio/mean": 0.999896764755249, "sampling/importance_sampling_ratio/min": 0.5372345447540283, "sampling/sampling_logp_difference/max": 0.6213204860687256, "sampling/sampling_logp_difference/mean": 0.009797719307243824, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5707.0, "completions/max_terminated_length": 5707.0, "completions/mean_length": 2939.83349609375, "completions/mean_terminated_length": 2939.83349609375, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "entropy": 0.3986974135041237, "epoch": 0.09634760705289673, "frac_reward_zero_std": 0.0, "grad_norm": 0.08790718465175089, "kl": 0.0018549556552898139, "learning_rate": 9.775637177991235e-07, "loss": 0.0215, "num_tokens": 11863842.0, "reward": 2.375, "reward_std": 0.5241308212280273, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.547068476676941, "sampling/importance_sampling_ratio/mean": 0.9999014735221863, "sampling/importance_sampling_ratio/min": 0.37435832619667053, "sampling/sampling_logp_difference/max": 0.9825417995452881, "sampling/sampling_logp_difference/mean": 0.00990060530602932, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4038.0, "completions/max_terminated_length": 4038.0, "completions/mean_length": 1978.416748046875, "completions/mean_terminated_length": 1978.416748046875, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.30446501448750496, "epoch": 0.09697732997481108, "frac_reward_zero_std": 0.0, "grad_norm": 0.09873771569536285, "kl": 0.0018139056046493351, "learning_rate": 9.772697971151768e-07, "loss": -0.0972, "num_tokens": 11926332.0, "reward": 2.5, "reward_std": 0.7326550483703613, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.8728822469711304, "sampling/importance_sampling_ratio/mean": 0.9999809861183167, "sampling/importance_sampling_ratio/min": 0.6287346482276917, "sampling/sampling_logp_difference/max": 0.6274785995483398, "sampling/sampling_logp_difference/mean": 0.007685003336519003, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7934.0, "completions/mean_length": 2833.33349609375, "completions/mean_terminated_length": 2600.347900390625, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "entropy": 0.3943259194493294, "epoch": 0.09760705289672544, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12361634339456905, "kl": 0.0016702164430171251, "learning_rate": 9.7697400849279e-07, "loss": 0.1457, "num_tokens": 12003988.0, "reward": 2.5, "reward_std": 0.4714045226573944, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7300028800964355, "sampling/importance_sampling_ratio/mean": 1.0000587701797485, "sampling/importance_sampling_ratio/min": 0.6879798173904419, "sampling/sampling_logp_difference/max": 0.5481230020523071, "sampling/sampling_logp_difference/mean": 0.00970379263162613, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6975.0, "completions/mean_length": 3349.70849609375, "completions/mean_terminated_length": 3139.174072265625, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "entropy": 0.5156123489141464, "epoch": 0.0982367758186398, "frac_reward_zero_std": 0.0, "grad_norm": 0.11548378209745706, "kl": 0.0014363856171257794, "learning_rate": 9.766763530896203e-07, "loss": 0.2659, "num_tokens": 12094053.0, "reward": 2.1666667461395264, "reward_std": 0.7017480731010437, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000799894332886, "sampling/importance_sampling_ratio/min": 0.5131649374961853, "sampling/sampling_logp_difference/max": 0.8785820007324219, "sampling/sampling_logp_difference/mean": 0.011559121310710907, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3361.0, "completions/max_terminated_length": 3361.0, "completions/mean_length": 1828.0833740234375, "completions/mean_terminated_length": 1828.0833740234375, "completions/min_length": 950.0, "completions/min_terminated_length": 950.0, "entropy": 0.3417332395911217, "epoch": 0.09886649874055416, "frac_reward_zero_std": 0.0, "grad_norm": 0.10791935030904441, "kl": 0.0019283184374216944, "learning_rate": 9.763768320706319e-07, "loss": -0.0418, "num_tokens": 12151863.0, "reward": 2.5, "reward_std": 0.5681797862052917, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3409053087234497, "sampling/importance_sampling_ratio/mean": 0.9998175501823425, "sampling/importance_sampling_ratio/min": 0.6833913326263428, "sampling/sampling_logp_difference/max": 0.3806877136230469, "sampling/sampling_logp_difference/mean": 0.00839892029762268, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7468.0, "completions/max_terminated_length": 7468.0, "completions/mean_length": 2412.416748046875, "completions/mean_terminated_length": 2412.416748046875, "completions/min_length": 1299.0, "completions/min_terminated_length": 1299.0, "entropy": 0.30660803988575935, "epoch": 0.09949622166246852, "frac_reward_zero_std": 0.0, "grad_norm": 0.1050902282575144, "kl": 0.001568466832395643, "learning_rate": 9.760754466080895e-07, "loss": 0.1421, "num_tokens": 12243417.0, "reward": 2.2916667461395264, "reward_std": 0.5049939155578613, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7787411212921143, "sampling/importance_sampling_ratio/mean": 0.9998791813850403, "sampling/importance_sampling_ratio/min": 0.622646152973175, "sampling/sampling_logp_difference/max": 0.5759057998657227, "sampling/sampling_logp_difference/mean": 0.008270813152194023, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3275.0, "completions/max_terminated_length": 3275.0, "completions/mean_length": 1931.0833740234375, "completions/mean_terminated_length": 1931.0833740234375, "completions/min_length": 586.0, "completions/min_terminated_length": 586.0, "entropy": 0.3598572611808777, "epoch": 0.10012594458438287, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06384607025597415, "kl": 0.0016721365682315081, "learning_rate": 9.757721978815558e-07, "loss": -0.0311, "num_tokens": 12305443.0, "reward": 2.25, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3602097034454346, "sampling/importance_sampling_ratio/mean": 0.9998583793640137, "sampling/importance_sampling_ratio/min": 0.49077168107032776, "sampling/sampling_logp_difference/max": 0.7117762565612793, "sampling/sampling_logp_difference/mean": 0.008979862555861473, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6461.0, "completions/mean_length": 3639.125, "completions/mean_terminated_length": 3441.174072265625, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "entropy": 0.3815739154815674, "epoch": 0.10075566750629723, "frac_reward_zero_std": 0.0, "grad_norm": 0.07610920359187946, "kl": 0.0013142449606675655, "learning_rate": 9.754670870778857e-07, "loss": 0.0812, "num_tokens": 12407862.0, "reward": 2.0416667461395264, "reward_std": 0.5863928198814392, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000672340393066, "sampling/importance_sampling_ratio/min": 0.5307820439338684, "sampling/sampling_logp_difference/max": 0.9220438003540039, "sampling/sampling_logp_difference/mean": 0.009388674050569534, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3259.0, "completions/max_terminated_length": 3259.0, "completions/mean_length": 1629.3333740234375, "completions/mean_terminated_length": 1629.3333740234375, "completions/min_length": 762.0, "completions/min_terminated_length": 762.0, "entropy": 0.3584264740347862, "epoch": 0.10138539042821158, "frac_reward_zero_std": 0.0, "grad_norm": 0.11812896895688048, "kl": 0.0021230900892987847, "learning_rate": 9.751601153912215e-07, "loss": 0.0218, "num_tokens": 12455918.0, "reward": 2.5416667461395264, "reward_std": 0.5175491571426392, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4804399013519287, "sampling/importance_sampling_ratio/mean": 1.00009286403656, "sampling/importance_sampling_ratio/min": 0.721239447593689, "sampling/sampling_logp_difference/max": 0.39233922958374023, "sampling/sampling_logp_difference/mean": 0.008846789598464966, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6598.0, "completions/max_terminated_length": 6598.0, "completions/mean_length": 3332.416748046875, "completions/mean_terminated_length": 3332.416748046875, "completions/min_length": 1665.0, "completions/min_terminated_length": 1665.0, "entropy": 0.44656170159578323, "epoch": 0.10201511335012595, "frac_reward_zero_std": 0.0, "grad_norm": 0.09054181645370867, "kl": 0.0016116077022161335, "learning_rate": 9.748512840229892e-07, "loss": -0.1235, "num_tokens": 12549664.0, "reward": 2.0833334922790527, "reward_std": 0.5404430627822876, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3873975276947021, "sampling/importance_sampling_ratio/mean": 1.0000718832015991, "sampling/importance_sampling_ratio/min": 0.3820738196372986, "sampling/sampling_logp_difference/max": 0.9621415138244629, "sampling/sampling_logp_difference/mean": 0.010450507514178753, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6929.0, "completions/max_terminated_length": 6929.0, "completions/mean_length": 3626.166748046875, "completions/mean_terminated_length": 3626.166748046875, "completions/min_length": 1263.0, "completions/min_terminated_length": 1263.0, "entropy": 0.48378925770521164, "epoch": 0.1026448362720403, "frac_reward_zero_std": 0.0, "grad_norm": 0.10779597421912106, "kl": 0.0016832906403578818, "learning_rate": 9.745405941818927e-07, "loss": -0.1924, "num_tokens": 12648788.0, "reward": 2.2083334922790527, "reward_std": 0.503990888595581, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7354457378387451, "sampling/importance_sampling_ratio/mean": 1.0000392198562622, "sampling/importance_sampling_ratio/min": 0.4912559986114502, "sampling/sampling_logp_difference/max": 0.7107899188995361, "sampling/sampling_logp_difference/mean": 0.01181984506547451, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6644.0, "completions/max_terminated_length": 6644.0, "completions/mean_length": 2753.666748046875, "completions/mean_terminated_length": 2753.666748046875, "completions/min_length": 1317.0, "completions/min_terminated_length": 1317.0, "entropy": 0.4353404566645622, "epoch": 0.10327455919395466, "frac_reward_zero_std": 0.0, "grad_norm": 0.10987142647401736, "kl": 0.0017576567770447582, "learning_rate": 9.742280470839102e-07, "loss": 0.0248, "num_tokens": 12729916.0, "reward": 2.0833334922790527, "reward_std": 0.5260697603225708, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4316015243530273, "sampling/importance_sampling_ratio/mean": 0.9999632835388184, "sampling/importance_sampling_ratio/min": 0.019642481580376625, "sampling/sampling_logp_difference/max": 3.930060625076294, "sampling/sampling_logp_difference/mean": 0.010959958657622337, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6969.0, "completions/mean_length": 3515.95849609375, "completions/mean_terminated_length": 3090.86376953125, "completions/min_length": 721.0, "completions/min_terminated_length": 721.0, "entropy": 0.44604121893644333, "epoch": 0.10390428211586902, "frac_reward_zero_std": 0.0, "grad_norm": 0.09932920281083114, "kl": 0.0018321133975405246, "learning_rate": 9.739136439522882e-07, "loss": 0.1983, "num_tokens": 12827019.0, "reward": 1.7916667461395264, "reward_std": 0.6026668548583984, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.628807783126831, "sampling/importance_sampling_ratio/mean": 0.9998701214790344, "sampling/importance_sampling_ratio/min": 0.5979943871498108, "sampling/sampling_logp_difference/max": 0.5141739845275879, "sampling/sampling_logp_difference/mean": 0.010962098836898804, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3799.0, "completions/max_terminated_length": 3799.0, "completions/mean_length": 1822.416748046875, "completions/mean_terminated_length": 1822.416748046875, "completions/min_length": 905.0, "completions/min_terminated_length": 905.0, "entropy": 0.4000724032521248, "epoch": 0.10453400503778337, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0835342945219738, "kl": 0.0020459661027416587, "learning_rate": 9.735973860175375e-07, "loss": -0.0696, "num_tokens": 12884941.0, "reward": 2.375, "reward_std": 0.21362332999706268, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8082060813903809, "sampling/importance_sampling_ratio/mean": 1.0001088380813599, "sampling/importance_sampling_ratio/min": 0.48814451694488525, "sampling/sampling_logp_difference/max": 0.7171437740325928, "sampling/sampling_logp_difference/mean": 0.009857280179858208, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2476.0, "completions/max_terminated_length": 2476.0, "completions/mean_length": 1565.375, "completions/mean_terminated_length": 1565.375, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.27283092960715294, "epoch": 0.10516372795969774, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08543426123003045, "kl": 0.002088782930513844, "learning_rate": 9.732792745174286e-07, "loss": 0.0256, "num_tokens": 12933734.0, "reward": 2.8333334922790527, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.674343466758728, "sampling/importance_sampling_ratio/mean": 1.0001271963119507, "sampling/importance_sampling_ratio/min": 0.6072112917900085, "sampling/sampling_logp_difference/max": 0.5154211521148682, "sampling/sampling_logp_difference/mean": 0.006998860742896795, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3377.0, "completions/max_terminated_length": 3377.0, "completions/mean_length": 1946.4583740234375, "completions/mean_terminated_length": 1946.4583740234375, "completions/min_length": 754.0, "completions/min_terminated_length": 754.0, "entropy": 0.5305140987038612, "epoch": 0.10579345088161209, "frac_reward_zero_std": 0.0, "grad_norm": 0.1194313682780162, "kl": 0.001989933691220358, "learning_rate": 9.72959310696986e-07, "loss": -0.0058, "num_tokens": 12989065.0, "reward": 2.2916667461395264, "reward_std": 0.6218037605285645, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4588122367858887, "sampling/importance_sampling_ratio/mean": 0.999995768070221, "sampling/importance_sampling_ratio/min": 0.6999390721321106, "sampling/sampling_logp_difference/max": 0.3776226043701172, "sampling/sampling_logp_difference/mean": 0.01203297358006239, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5249.0, "completions/max_terminated_length": 5249.0, "completions/mean_length": 2287.75, "completions/mean_terminated_length": 2287.75, "completions/min_length": 864.0, "completions/min_terminated_length": 864.0, "entropy": 0.47971030324697495, "epoch": 0.10642317380352645, "frac_reward_zero_std": 0.0, "grad_norm": 0.11470299309219652, "kl": 0.0021282942325342447, "learning_rate": 9.726374958084842e-07, "loss": -0.0772, "num_tokens": 13053539.0, "reward": 2.375, "reward_std": 0.6380135416984558, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7975267171859741, "sampling/importance_sampling_ratio/mean": 0.9999487996101379, "sampling/importance_sampling_ratio/min": 0.48779723048210144, "sampling/sampling_logp_difference/max": 0.7178554534912109, "sampling/sampling_logp_difference/mean": 0.011269193142652512, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6241.0, "completions/max_terminated_length": 6241.0, "completions/mean_length": 2309.666748046875, "completions/mean_terminated_length": 2309.666748046875, "completions/min_length": 706.0, "completions/min_terminated_length": 706.0, "entropy": 0.43445179611444473, "epoch": 0.1070528967254408, "frac_reward_zero_std": 0.0, "grad_norm": 0.12290399978561721, "kl": 0.001873214408988133, "learning_rate": 9.723138311114423e-07, "loss": 0.0446, "num_tokens": 13118515.0, "reward": 2.2083334922790527, "reward_std": 0.6523736715316772, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7487831115722656, "sampling/importance_sampling_ratio/mean": 1.0001795291900635, "sampling/importance_sampling_ratio/min": 0.6410769820213318, "sampling/sampling_logp_difference/max": 0.55892014503479, "sampling/sampling_logp_difference/mean": 0.009943627752363682, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3167.0, "completions/max_terminated_length": 3167.0, "completions/mean_length": 1378.5, "completions/mean_terminated_length": 1378.5, "completions/min_length": 557.0, "completions/min_terminated_length": 557.0, "entropy": 0.380528062582016, "epoch": 0.10768261964735516, "frac_reward_zero_std": 0.0, "grad_norm": 0.13237884331595906, "kl": 0.0021276990009937435, "learning_rate": 9.719883178726191e-07, "loss": -0.0224, "num_tokens": 13158351.0, "reward": 2.3333334922790527, "reward_std": 0.6495786905288696, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3063515424728394, "sampling/importance_sampling_ratio/mean": 1.0004056692123413, "sampling/importance_sampling_ratio/min": 0.7275006175041199, "sampling/sampling_logp_difference/max": 0.31814050674438477, "sampling/sampling_logp_difference/mean": 0.009094382636249065, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3859.0, "completions/max_terminated_length": 3859.0, "completions/mean_length": 1558.8333740234375, "completions/mean_terminated_length": 1558.8333740234375, "completions/min_length": 737.0, "completions/min_terminated_length": 737.0, "entropy": 0.39246417582035065, "epoch": 0.10831234256926953, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10924767176425527, "kl": 0.0020914071646984667, "learning_rate": 9.716609573660081e-07, "loss": -0.0111, "num_tokens": 13205403.0, "reward": 2.7916667461395264, "reward_std": 0.3698274493217468, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002511739730835, "sampling/importance_sampling_ratio/min": 0.5285110473632812, "sampling/sampling_logp_difference/max": 1.0365679264068604, "sampling/sampling_logp_difference/mean": 0.00970109086483717, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6010.0, "completions/max_terminated_length": 6010.0, "completions/mean_length": 2637.041748046875, "completions/mean_terminated_length": 2637.041748046875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "entropy": 0.381532683968544, "epoch": 0.10894206549118388, "frac_reward_zero_std": 0.0, "grad_norm": 0.10938591461069024, "kl": 0.0018117943545803428, "learning_rate": 9.71331750872833e-07, "loss": -0.144, "num_tokens": 13287996.0, "reward": 2.2916667461395264, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7878934144973755, "sampling/importance_sampling_ratio/mean": 1.0000194311141968, "sampling/importance_sampling_ratio/min": 0.49781087040901184, "sampling/sampling_logp_difference/max": 0.6975350379943848, "sampling/sampling_logp_difference/mean": 0.009844837710261345, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5290.0, "completions/max_terminated_length": 5290.0, "completions/mean_length": 3151.95849609375, "completions/mean_terminated_length": 3151.95849609375, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "entropy": 0.3938901424407959, "epoch": 0.10957178841309824, "frac_reward_zero_std": 0.0, "grad_norm": 0.08343113234014167, "kl": 0.0016309983911924064, "learning_rate": 9.71000699681542e-07, "loss": -0.0901, "num_tokens": 13380979.0, "reward": 2.0416667461395264, "reward_std": 0.6354264616966248, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998776316642761, "sampling/importance_sampling_ratio/min": 0.55330491065979, "sampling/sampling_logp_difference/max": 1.8638663291931152, "sampling/sampling_logp_difference/mean": 0.010072361677885056, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6885.0, "completions/max_terminated_length": 6885.0, "completions/mean_length": 2618.416748046875, "completions/mean_terminated_length": 2618.416748046875, "completions/min_length": 898.0, "completions/min_terminated_length": 898.0, "entropy": 0.3405621871352196, "epoch": 0.11020151133501259, "frac_reward_zero_std": 0.0, "grad_norm": 0.10878753704888033, "kl": 0.0016582478128839284, "learning_rate": 9.70667805087803e-07, "loss": -0.132, "num_tokens": 13472045.0, "reward": 2.3333334922790527, "reward_std": 0.5232069492340088, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000478029251099, "sampling/importance_sampling_ratio/min": 0.0673350840806961, "sampling/sampling_logp_difference/max": 2.6980738639831543, "sampling/sampling_logp_difference/mean": 0.00913037545979023, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3606.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 2410.041748046875, "completions/mean_terminated_length": 2410.041748046875, "completions/min_length": 1221.0, "completions/min_terminated_length": 1221.0, "entropy": 0.2979360818862915, "epoch": 0.11083123425692695, "frac_reward_zero_std": 0.0, "grad_norm": 0.13324333365540636, "kl": 0.0019041244813706726, "learning_rate": 9.703330683944992e-07, "loss": -0.0084, "num_tokens": 13555238.0, "reward": 2.2916667461395264, "reward_std": 0.5049939155578613, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4279104471206665, "sampling/importance_sampling_ratio/mean": 0.9998199343681335, "sampling/importance_sampling_ratio/min": 0.4753560423851013, "sampling/sampling_logp_difference/max": 0.7436912059783936, "sampling/sampling_logp_difference/mean": 0.007840598002076149, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4809.0, "completions/max_terminated_length": 4809.0, "completions/mean_length": 2282.20849609375, "completions/mean_terminated_length": 2282.20849609375, "completions/min_length": 1113.0, "completions/min_terminated_length": 1113.0, "entropy": 0.3495143726468086, "epoch": 0.11146095717884132, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08790594805126842, "kl": 0.002060112892650068, "learning_rate": 9.699964909117226e-07, "loss": -0.131, "num_tokens": 13630171.0, "reward": 2.0833334922790527, "reward_std": 0.45069071650505066, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.4536765813827515, "sampling/importance_sampling_ratio/mean": 0.9999914169311523, "sampling/importance_sampling_ratio/min": 0.5593324303627014, "sampling/sampling_logp_difference/max": 0.5810112953186035, "sampling/sampling_logp_difference/mean": 0.00866493210196495, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4014.0, "completions/max_terminated_length": 4014.0, "completions/mean_length": 2492.5, "completions/mean_terminated_length": 2492.5, "completions/min_length": 1125.0, "completions/min_terminated_length": 1125.0, "entropy": 0.43072986602783203, "epoch": 0.11209068010075567, "frac_reward_zero_std": 0.0, "grad_norm": 0.10548712929534185, "kl": 0.00181352804065682, "learning_rate": 9.696580739567704e-07, "loss": 0.061, "num_tokens": 13709143.0, "reward": 2.4166667461395264, "reward_std": 0.5260698199272156, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999814629554749, "sampling/importance_sampling_ratio/min": 0.4672411382198334, "sampling/sampling_logp_difference/max": 0.7609097957611084, "sampling/sampling_logp_difference/mean": 0.010302710346877575, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5788.0, "completions/mean_length": 2804.95849609375, "completions/mean_terminated_length": 2570.7392578125, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "entropy": 0.41891147941350937, "epoch": 0.11272040302267003, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07413564750193495, "kl": 0.0018033761298283935, "learning_rate": 9.693178188541387e-07, "loss": 0.101, "num_tokens": 13786238.0, "reward": 2.625, "reward_std": 0.33034375309944153, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.6096357107162476, "sampling/importance_sampling_ratio/mean": 1.0000547170639038, "sampling/importance_sampling_ratio/min": 0.6332558989524841, "sampling/sampling_logp_difference/max": 0.47600793838500977, "sampling/sampling_logp_difference/mean": 0.00992595124989748, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7167.0, "completions/mean_length": 3046.375, "completions/mean_terminated_length": 2578.591064453125, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "entropy": 0.3403974547982216, "epoch": 0.11335012594458438, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0983131960221579, "kl": 0.001782115490641445, "learning_rate": 9.689757269355178e-07, "loss": 0.0901, "num_tokens": 13869807.0, "reward": 2.5, "reward_std": 0.623338520526886, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999960720539093, "sampling/importance_sampling_ratio/min": 0.5742309093475342, "sampling/sampling_logp_difference/max": 0.9655296802520752, "sampling/sampling_logp_difference/mean": 0.008149371482431889, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2715.0, "completions/max_terminated_length": 2715.0, "completions/mean_length": 1705.0833740234375, "completions/mean_terminated_length": 1705.0833740234375, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.42713191360235214, "epoch": 0.11397984886649874, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11497190474062041, "kl": 0.002550514240283519, "learning_rate": 9.686317995397872e-07, "loss": -0.0106, "num_tokens": 13920585.0, "reward": 2.5, "reward_std": 0.5261822938919067, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2699114084243774, "sampling/importance_sampling_ratio/mean": 0.9998380541801453, "sampling/importance_sampling_ratio/min": 0.7354353070259094, "sampling/sampling_logp_difference/max": 0.3072926998138428, "sampling/sampling_logp_difference/mean": 0.010205830447375774, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7610.0, "completions/max_terminated_length": 7610.0, "completions/mean_length": 2589.33349609375, "completions/mean_terminated_length": 2589.33349609375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "entropy": 0.3813788890838623, "epoch": 0.11460957178841309, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07232588056210575, "kl": 0.002094320923788473, "learning_rate": 9.682860380130094e-07, "loss": 0.0675, "num_tokens": 13994561.0, "reward": 2.4583334922790527, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6467883586883545, "sampling/importance_sampling_ratio/mean": 1.0000277757644653, "sampling/importance_sampling_ratio/min": 0.6192190647125244, "sampling/sampling_logp_difference/max": 0.4988269805908203, "sampling/sampling_logp_difference/mean": 0.009392744861543179, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8001.0, "completions/mean_length": 3171.666748046875, "completions/mean_terminated_length": 2953.391357421875, "completions/min_length": 1354.0, "completions/min_terminated_length": 1354.0, "entropy": 0.48455896228551865, "epoch": 0.11523929471032746, "frac_reward_zero_std": 0.0, "grad_norm": 0.1255742608894579, "kl": 0.0020032976754009724, "learning_rate": 9.679384437084263e-07, "loss": 0.0379, "num_tokens": 14084777.0, "reward": 2.2083334922790527, "reward_std": 0.599763035774231, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.3735930919647217, "sampling/importance_sampling_ratio/mean": 1.0000680685043335, "sampling/importance_sampling_ratio/min": 0.670304000377655, "sampling/sampling_logp_difference/max": 0.4000239372253418, "sampling/sampling_logp_difference/mean": 0.01124223880469799, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6901.0, "completions/max_terminated_length": 6901.0, "completions/mean_length": 2515.45849609375, "completions/mean_terminated_length": 2515.45849609375, "completions/min_length": 866.0, "completions/min_terminated_length": 866.0, "entropy": 0.3404723182320595, "epoch": 0.11586901763224182, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.091866166920994, "kl": 0.001884366327431053, "learning_rate": 9.675890179864522e-07, "loss": -0.1075, "num_tokens": 14157428.0, "reward": 2.375, "reward_std": 0.4244926869869232, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7356654405593872, "sampling/importance_sampling_ratio/mean": 0.9998121857643127, "sampling/importance_sampling_ratio/min": 0.40625491738319397, "sampling/sampling_logp_difference/max": 0.9007744789123535, "sampling/sampling_logp_difference/mean": 0.008797121234238148, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6285.0, "completions/mean_length": 3093.916748046875, "completions/mean_terminated_length": 2872.260986328125, "completions/min_length": 1083.0, "completions/min_terminated_length": 1083.0, "entropy": 0.4577285274863243, "epoch": 0.11649874055415617, "frac_reward_zero_std": 0.0, "grad_norm": 0.10559368331189706, "kl": 0.0016982359520625323, "learning_rate": 9.672377622146695e-07, "loss": 0.0911, "num_tokens": 14240490.0, "reward": 2.0416667461395264, "reward_std": 0.7116548418998718, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.445383071899414, "sampling/importance_sampling_ratio/mean": 1.000003695487976, "sampling/importance_sampling_ratio/min": 0.6396164894104004, "sampling/sampling_logp_difference/max": 0.4468865394592285, "sampling/sampling_logp_difference/mean": 0.011409996077418327, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6891.0, "completions/max_terminated_length": 6891.0, "completions/mean_length": 2463.5, "completions/mean_terminated_length": 2463.5, "completions/min_length": 733.0, "completions/min_terminated_length": 733.0, "entropy": 0.39114487916231155, "epoch": 0.11712846347607053, "frac_reward_zero_std": 0.0, "grad_norm": 0.09916541015431729, "kl": 0.0019598373328335583, "learning_rate": 9.668846777678229e-07, "loss": -0.0117, "num_tokens": 14310262.0, "reward": 2.6666667461395264, "reward_std": 0.6503192186355591, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.557985544204712, "sampling/importance_sampling_ratio/mean": 1.0000633001327515, "sampling/importance_sampling_ratio/min": 0.7089758515357971, "sampling/sampling_logp_difference/max": 0.4433937072753906, "sampling/sampling_logp_difference/mean": 0.009575293399393559, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4678.0, "completions/max_terminated_length": 4678.0, "completions/mean_length": 2562.83349609375, "completions/mean_terminated_length": 2562.83349609375, "completions/min_length": 1259.0, "completions/min_terminated_length": 1259.0, "entropy": 0.359483540058136, "epoch": 0.11775818639798488, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09221804361248535, "kl": 0.0019439614843577147, "learning_rate": 9.665297660278142e-07, "loss": 0.0251, "num_tokens": 14389250.0, "reward": 2.25, "reward_std": 0.45069071650505066, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5412163734436035, "sampling/importance_sampling_ratio/mean": 0.9999703764915466, "sampling/importance_sampling_ratio/min": 0.5789297819137573, "sampling/sampling_logp_difference/max": 0.5465741157531738, "sampling/sampling_logp_difference/mean": 0.009138579480350018, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5882.0, "completions/max_terminated_length": 5882.0, "completions/mean_length": 2339.70849609375, "completions/mean_terminated_length": 2339.70849609375, "completions/min_length": 765.0, "completions/min_terminated_length": 765.0, "entropy": 0.4093446508049965, "epoch": 0.11838790931989925, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06187873715268646, "kl": 0.0017126238672062755, "learning_rate": 9.661730283836968e-07, "loss": 0.0332, "num_tokens": 14453139.0, "reward": 2.4583334922790527, "reward_std": 0.17251639068126678, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4521199464797974, "sampling/importance_sampling_ratio/mean": 0.9999344348907471, "sampling/importance_sampling_ratio/min": 0.6408275365829468, "sampling/sampling_logp_difference/max": 0.4449949264526367, "sampling/sampling_logp_difference/mean": 0.010293442755937576, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6837.0, "completions/max_terminated_length": 6837.0, "completions/mean_length": 3038.375, "completions/mean_terminated_length": 3038.375, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "entropy": 0.3030368536710739, "epoch": 0.1190176322418136, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09066859775232701, "kl": 0.0017574737139511853, "learning_rate": 9.658144662316708e-07, "loss": 0.086, "num_tokens": 14558724.0, "reward": 2.1666667461395264, "reward_std": 0.4993361234664917, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7081581354141235, "sampling/importance_sampling_ratio/mean": 0.9998790621757507, "sampling/importance_sampling_ratio/min": 0.548177182674408, "sampling/sampling_logp_difference/max": 0.6011567115783691, "sampling/sampling_logp_difference/mean": 0.007942762225866318, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4500.0, "completions/max_terminated_length": 4500.0, "completions/mean_length": 1815.875, "completions/mean_terminated_length": 1815.875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "entropy": 0.3057568445801735, "epoch": 0.11964735516372796, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08241017160670554, "kl": 0.002134767477400601, "learning_rate": 9.654540809750766e-07, "loss": -0.0379, "num_tokens": 14613857.0, "reward": 2.7083334922790527, "reward_std": 0.48371022939682007, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4427698850631714, "sampling/importance_sampling_ratio/mean": 1.0000886917114258, "sampling/importance_sampling_ratio/min": 0.6155476570129395, "sampling/sampling_logp_difference/max": 0.4852428436279297, "sampling/sampling_logp_difference/mean": 0.007995788007974625, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7328.0, "completions/max_terminated_length": 7328.0, "completions/mean_length": 2992.166748046875, "completions/mean_terminated_length": 2992.166748046875, "completions/min_length": 1141.0, "completions/min_terminated_length": 1141.0, "entropy": 0.36270932108163834, "epoch": 0.12027707808564232, "frac_reward_zero_std": 0.0, "grad_norm": 0.08147739680657762, "kl": 0.0019099388155154884, "learning_rate": 9.650918740243897e-07, "loss": -0.1455, "num_tokens": 14696893.0, "reward": 2.4583334922790527, "reward_std": 0.7653362154960632, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000224113464355, "sampling/importance_sampling_ratio/min": 0.45282211899757385, "sampling/sampling_logp_difference/max": 0.7922558784484863, "sampling/sampling_logp_difference/mean": 0.009419341571629047, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4594.0, "completions/max_terminated_length": 4594.0, "completions/mean_length": 2611.08349609375, "completions/mean_terminated_length": 2611.08349609375, "completions/min_length": 1326.0, "completions/min_terminated_length": 1326.0, "entropy": 0.3501971960067749, "epoch": 0.12090680100755667, "frac_reward_zero_std": 0.0, "grad_norm": 0.10495603036921357, "kl": 0.0020367557008285075, "learning_rate": 9.647278467972156e-07, "loss": 0.0359, "num_tokens": 14783223.0, "reward": 2.0416667461395264, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7877534627914429, "sampling/importance_sampling_ratio/mean": 0.9996924996376038, "sampling/importance_sampling_ratio/min": 0.5183831453323364, "sampling/sampling_logp_difference/max": 0.6570405960083008, "sampling/sampling_logp_difference/mean": 0.009327799081802368, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6591.0, "completions/max_terminated_length": 6591.0, "completions/mean_length": 2741.541748046875, "completions/mean_terminated_length": 2741.541748046875, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.2919098734855652, "epoch": 0.12153652392947104, "frac_reward_zero_std": 0.0, "grad_norm": 0.0877808000043943, "kl": 0.001815613213693723, "learning_rate": 9.64362000718284e-07, "loss": -0.0245, "num_tokens": 14869076.0, "reward": 2.5, "reward_std": 0.5260697603225708, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6193519830703735, "sampling/importance_sampling_ratio/mean": 0.9997871518135071, "sampling/importance_sampling_ratio/min": 0.34109747409820557, "sampling/sampling_logp_difference/max": 1.0755870342254639, "sampling/sampling_logp_difference/mean": 0.007828141562640667, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6749.0, "completions/max_terminated_length": 6749.0, "completions/mean_length": 3764.541748046875, "completions/mean_terminated_length": 3764.541748046875, "completions/min_length": 1050.0, "completions/min_terminated_length": 1050.0, "entropy": 0.47637584060430527, "epoch": 0.12216624685138538, "frac_reward_zero_std": 0.0, "grad_norm": 0.11226027569033566, "kl": 0.0016908227698877454, "learning_rate": 9.639943372194435e-07, "loss": 0.0548, "num_tokens": 14974505.0, "reward": 2.3333334922790527, "reward_std": 0.6398054361343384, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7351831197738647, "sampling/importance_sampling_ratio/mean": 0.9999608993530273, "sampling/importance_sampling_ratio/min": 0.6380823850631714, "sampling/sampling_logp_difference/max": 0.5511128902435303, "sampling/sampling_logp_difference/mean": 0.011316128075122833, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3174.0, "completions/max_terminated_length": 3174.0, "completions/mean_length": 1384.875, "completions/mean_terminated_length": 1384.875, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "entropy": 0.44625575095415115, "epoch": 0.12279596977329975, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07549872994504009, "kl": 0.002911793941166252, "learning_rate": 9.636248577396553e-07, "loss": -0.1459, "num_tokens": 15020182.0, "reward": 2.7916667461395264, "reward_std": 0.39591166377067566, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3453413248062134, "sampling/importance_sampling_ratio/mean": 1.0000463724136353, "sampling/importance_sampling_ratio/min": 0.70003342628479, "sampling/sampling_logp_difference/max": 0.3566272258758545, "sampling/sampling_logp_difference/mean": 0.010548440739512444, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4987.0, "completions/max_terminated_length": 4987.0, "completions/mean_length": 2165.25, "completions/mean_terminated_length": 2165.25, "completions/min_length": 1039.0, "completions/min_terminated_length": 1039.0, "entropy": 0.42303019762039185, "epoch": 0.12342569269521411, "frac_reward_zero_std": 0.0, "grad_norm": 0.11766249900842378, "kl": 0.0020734346471726894, "learning_rate": 9.63253563724988e-07, "loss": -0.0465, "num_tokens": 15095436.0, "reward": 2.3333334922790527, "reward_std": 0.5681797862052917, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.935102105140686, "sampling/importance_sampling_ratio/mean": 1.000091791152954, "sampling/importance_sampling_ratio/min": 0.6943457722663879, "sampling/sampling_logp_difference/max": 0.6601600646972656, "sampling/sampling_logp_difference/mean": 0.010687198489904404, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5088.0, "completions/max_terminated_length": 5088.0, "completions/mean_length": 2212.58349609375, "completions/mean_terminated_length": 2212.58349609375, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "entropy": 0.23262885212898254, "epoch": 0.12405541561712846, "frac_reward_zero_std": 0.0, "grad_norm": 0.08206938072038267, "kl": 0.0018694950267672539, "learning_rate": 9.62880456628612e-07, "loss": 0.0394, "num_tokens": 15184002.0, "reward": 2.25, "reward_std": 0.584453821182251, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6292325258255005, "sampling/importance_sampling_ratio/mean": 0.9998909831047058, "sampling/importance_sampling_ratio/min": 0.5679710507392883, "sampling/sampling_logp_difference/max": 0.5656847953796387, "sampling/sampling_logp_difference/mean": 0.006082319654524326, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4956.0, "completions/max_terminated_length": 4956.0, "completions/mean_length": 2294.916748046875, "completions/mean_terminated_length": 2294.916748046875, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.3605474531650543, "epoch": 0.12468513853904283, "frac_reward_zero_std": 0.0, "grad_norm": 0.10642078667575514, "kl": 0.001975873688934371, "learning_rate": 9.62505537910794e-07, "loss": -0.0959, "num_tokens": 15260600.0, "reward": 2.0, "reward_std": 0.5201624631881714, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000001192092896, "sampling/importance_sampling_ratio/min": 0.4264587461948395, "sampling/sampling_logp_difference/max": 0.9619989395141602, "sampling/sampling_logp_difference/mean": 0.009168088436126709, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7212.0, "completions/max_terminated_length": 7212.0, "completions/mean_length": 3099.375, "completions/mean_terminated_length": 3099.375, "completions/min_length": 1287.0, "completions/min_terminated_length": 1287.0, "entropy": 0.34072525054216385, "epoch": 0.1253148614609572, "frac_reward_zero_std": 0.0, "grad_norm": 0.09040720632438631, "kl": 0.0017943163984455168, "learning_rate": 9.621288090388905e-07, "loss": 0.0258, "num_tokens": 15345921.0, "reward": 2.8333334922790527, "reward_std": 0.3900056481361389, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.880078673362732, "sampling/importance_sampling_ratio/mean": 0.9999769330024719, "sampling/importance_sampling_ratio/min": 0.3429229259490967, "sampling/sampling_logp_difference/max": 1.0702495574951172, "sampling/sampling_logp_difference/mean": 0.008060719817876816, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6914.0, "completions/mean_length": 3488.08349609375, "completions/mean_terminated_length": 3283.565185546875, "completions/min_length": 1415.0, "completions/min_terminated_length": 1415.0, "entropy": 0.46677885204553604, "epoch": 0.12594458438287154, "frac_reward_zero_std": 0.0, "grad_norm": 0.11164087618486052, "kl": 0.002034559001913294, "learning_rate": 9.617502714873433e-07, "loss": 0.0746, "num_tokens": 15444715.0, "reward": 2.25, "reward_std": 0.6838971376419067, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7316901683807373, "sampling/importance_sampling_ratio/mean": 0.9998013973236084, "sampling/importance_sampling_ratio/min": 0.22277173399925232, "sampling/sampling_logp_difference/max": 1.5016076564788818, "sampling/sampling_logp_difference/mean": 0.011464744806289673, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7843.0, "completions/mean_length": 2776.33349609375, "completions/mean_terminated_length": 2540.86962890625, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "entropy": 0.43915142118930817, "epoch": 0.1265743073047859, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10703564721870742, "kl": 0.001962874608580023, "learning_rate": 9.613699267376725e-07, "loss": 0.0634, "num_tokens": 15521075.0, "reward": 2.3333334922790527, "reward_std": 0.4497717618942261, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.9295635223388672, "sampling/importance_sampling_ratio/mean": 1.0000934600830078, "sampling/importance_sampling_ratio/min": 0.4760001003742218, "sampling/sampling_logp_difference/max": 0.7423372268676758, "sampling/sampling_logp_difference/mean": 0.010724632069468498, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7526.0, "completions/mean_length": 5451.7919921875, "completions/mean_terminated_length": 5060.33349609375, "completions/min_length": 2634.0, "completions/min_terminated_length": 2634.0, "entropy": 0.5746569186449051, "epoch": 0.12720403022670027, "frac_reward_zero_std": 0.0, "grad_norm": 0.0874738972655521, "kl": 0.0017667466890998185, "learning_rate": 9.609877762784709e-07, "loss": 0.1515, "num_tokens": 15667062.0, "reward": 2.0, "reward_std": 0.9880760908126831, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.7619211673736572, "sampling/importance_sampling_ratio/mean": 0.9999957084655762, "sampling/importance_sampling_ratio/min": 0.4974243640899658, "sampling/sampling_logp_difference/max": 0.6983118057250977, "sampling/sampling_logp_difference/mean": 0.01400892436504364, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7434.0, "completions/max_terminated_length": 7434.0, "completions/mean_length": 3089.58349609375, "completions/mean_terminated_length": 3089.58349609375, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.44042227417230606, "epoch": 0.12783375314861462, "frac_reward_zero_std": 0.0, "grad_norm": 0.07732638759332983, "kl": 0.00200339691946283, "learning_rate": 9.606038216053994e-07, "loss": -0.0703, "num_tokens": 15763308.0, "reward": 2.25, "reward_std": 0.6354002952575684, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5744726657867432, "sampling/importance_sampling_ratio/mean": 0.9999411702156067, "sampling/importance_sampling_ratio/min": 0.0027388620655983686, "sampling/sampling_logp_difference/max": 5.90021276473999, "sampling/sampling_logp_difference/mean": 0.010759817436337471, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4970.0, "completions/max_terminated_length": 4970.0, "completions/mean_length": 1955.0833740234375, "completions/mean_terminated_length": 1955.0833740234375, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "entropy": 0.31780073791742325, "epoch": 0.12846347607052896, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09147893212943038, "kl": 0.002253812679555267, "learning_rate": 9.602180642211792e-07, "loss": -0.0731, "num_tokens": 15820014.0, "reward": 2.75, "reward_std": 0.34503278136253357, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4137465953826904, "sampling/importance_sampling_ratio/mean": 1.000131368637085, "sampling/importance_sampling_ratio/min": 0.5373886823654175, "sampling/sampling_logp_difference/max": 0.6210336685180664, "sampling/sampling_logp_difference/mean": 0.008121917955577374, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5909.0, "completions/max_terminated_length": 5909.0, "completions/mean_length": 3413.416748046875, "completions/mean_terminated_length": 3413.416748046875, "completions/min_length": 1362.0, "completions/min_terminated_length": 1362.0, "entropy": 0.3635251894593239, "epoch": 0.12909319899244331, "frac_reward_zero_std": 0.0, "grad_norm": 0.08675387254620867, "kl": 0.0021474766253959388, "learning_rate": 9.59830505635588e-07, "loss": -0.0588, "num_tokens": 15922248.0, "reward": 2.3333334922790527, "reward_std": 0.6398054361343384, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6095517873764038, "sampling/importance_sampling_ratio/mean": 1.0001691579818726, "sampling/importance_sampling_ratio/min": 0.5596595406532288, "sampling/sampling_logp_difference/max": 0.5804266929626465, "sampling/sampling_logp_difference/mean": 0.009486234746873379, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5177.0, "completions/max_terminated_length": 5177.0, "completions/mean_length": 2725.291748046875, "completions/mean_terminated_length": 2725.291748046875, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "entropy": 0.35095570981502533, "epoch": 0.1297229219143577, "frac_reward_zero_std": 0.0, "grad_norm": 0.09036359693614657, "kl": 0.0021428452455438673, "learning_rate": 9.594411473654523e-07, "loss": 0.0041, "num_tokens": 16009471.0, "reward": 2.25, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998865723609924, "sampling/importance_sampling_ratio/min": 0.5790965557098389, "sampling/sampling_logp_difference/max": 0.8571099042892456, "sampling/sampling_logp_difference/mean": 0.009079134091734886, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7425.0, "completions/max_terminated_length": 7425.0, "completions/mean_length": 3089.45849609375, "completions/mean_terminated_length": 3089.45849609375, "completions/min_length": 835.0, "completions/min_terminated_length": 835.0, "entropy": 0.4969359189271927, "epoch": 0.13035264483627204, "frac_reward_zero_std": 0.0, "grad_norm": 0.10613679093046699, "kl": 0.0023056210775393993, "learning_rate": 9.590499909346423e-07, "loss": -0.0934, "num_tokens": 16092610.0, "reward": 2.5416667461395264, "reward_std": 0.6685322523117065, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.729591727256775, "sampling/importance_sampling_ratio/mean": 1.0000340938568115, "sampling/importance_sampling_ratio/min": 0.643634557723999, "sampling/sampling_logp_difference/max": 0.5478854179382324, "sampling/sampling_logp_difference/mean": 0.01272481307387352, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2478.0, "completions/max_terminated_length": 2478.0, "completions/mean_length": 1464.875, "completions/mean_terminated_length": 1464.875, "completions/min_length": 536.0, "completions/min_terminated_length": 536.0, "entropy": 0.30990201234817505, "epoch": 0.1309823677581864, "frac_reward_zero_std": 0.0, "grad_norm": 0.12242950101800404, "kl": 0.002873534627724439, "learning_rate": 9.586570378740661e-07, "loss": 0.074, "num_tokens": 16139151.0, "reward": 2.5416667461395264, "reward_std": 0.5175491571426392, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.425347924232483, "sampling/importance_sampling_ratio/mean": 0.9999297261238098, "sampling/importance_sampling_ratio/min": 0.6712461709976196, "sampling/sampling_logp_difference/max": 0.3986194133758545, "sampling/sampling_logp_difference/mean": 0.007553353440016508, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5093.0, "completions/max_terminated_length": 5093.0, "completions/mean_length": 2413.041748046875, "completions/mean_terminated_length": 2413.041748046875, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "entropy": 0.5708277225494385, "epoch": 0.13161209068010077, "frac_reward_zero_std": 0.0, "grad_norm": 0.12183975030933042, "kl": 0.0025956694153137505, "learning_rate": 9.582622897216635e-07, "loss": 0.0033, "num_tokens": 16209624.0, "reward": 2.25, "reward_std": 0.5989742279052734, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.6646809577941895, "sampling/importance_sampling_ratio/mean": 1.0001367330551147, "sampling/importance_sampling_ratio/min": 0.6421232223510742, "sampling/sampling_logp_difference/max": 0.5096335411071777, "sampling/sampling_logp_difference/mean": 0.013355601578950882, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7582.0, "completions/max_terminated_length": 7582.0, "completions/mean_length": 2305.95849609375, "completions/mean_terminated_length": 2305.95849609375, "completions/min_length": 927.0, "completions/min_terminated_length": 927.0, "entropy": 0.4468117728829384, "epoch": 0.13224181360201512, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1242878000085787, "kl": 0.0025528239784762263, "learning_rate": 9.578657480223992e-07, "loss": -0.1821, "num_tokens": 16275343.0, "reward": 2.2083334922790527, "reward_std": 0.48371022939682007, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6411560773849487, "sampling/importance_sampling_ratio/mean": 0.9999744296073914, "sampling/importance_sampling_ratio/min": 0.5306658148765564, "sampling/sampling_logp_difference/max": 0.6336228847503662, "sampling/sampling_logp_difference/mean": 0.01040590088814497, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 8060.0, "completions/mean_length": 4294.7919921875, "completions/mean_terminated_length": 3940.5, "completions/min_length": 1996.0, "completions/min_terminated_length": 1996.0, "entropy": 0.24798433855175972, "epoch": 0.13287153652392947, "frac_reward_zero_std": 0.0, "grad_norm": 0.08579764491804641, "kl": 0.0016219160170294344, "learning_rate": 9.574674143282586e-07, "loss": 0.0734, "num_tokens": 16435058.0, "reward": 1.625, "reward_std": 0.7410844564437866, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.8109854459762573, "sampling/importance_sampling_ratio/mean": 0.999876081943512, "sampling/importance_sampling_ratio/min": 0.4812748432159424, "sampling/sampling_logp_difference/max": 0.7313168048858643, "sampling/sampling_logp_difference/mean": 0.006967098452150822, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2745.0, "completions/max_terminated_length": 2745.0, "completions/mean_length": 1632.791748046875, "completions/mean_terminated_length": 1632.791748046875, "completions/min_length": 859.0, "completions/min_terminated_length": 859.0, "entropy": 0.32008063793182373, "epoch": 0.13350125944584382, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09885470738204433, "kl": 0.0027627762174233794, "learning_rate": 9.570672901982399e-07, "loss": -0.0543, "num_tokens": 16491829.0, "reward": 2.6666667461395264, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8773869276046753, "sampling/importance_sampling_ratio/mean": 0.9999025464057922, "sampling/importance_sampling_ratio/min": 0.6199236512184143, "sampling/sampling_logp_difference/max": 0.6298809051513672, "sampling/sampling_logp_difference/mean": 0.008160213939845562, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5540.0, "completions/max_terminated_length": 5540.0, "completions/mean_length": 2395.58349609375, "completions/mean_terminated_length": 2395.58349609375, "completions/min_length": 1192.0, "completions/min_terminated_length": 1192.0, "entropy": 0.30693718791007996, "epoch": 0.1341309823677582, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07978774016106004, "kl": 0.002118054195307195, "learning_rate": 9.566653771983486e-07, "loss": 0.0415, "num_tokens": 16567827.0, "reward": 2.5833334922790527, "reward_std": 0.40627965331077576, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.465942621231079, "sampling/importance_sampling_ratio/mean": 0.9999415874481201, "sampling/importance_sampling_ratio/min": 0.615109920501709, "sampling/sampling_logp_difference/max": 0.48595428466796875, "sampling/sampling_logp_difference/mean": 0.00837300531566143, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8174.0, "completions/mean_length": 3668.666748046875, "completions/mean_terminated_length": 3472.0, "completions/min_length": 1566.0, "completions/min_terminated_length": 1566.0, "entropy": 0.3273930773139, "epoch": 0.13476070528967254, "frac_reward_zero_std": 0.0, "grad_norm": 0.08746702208666754, "kl": 0.0017201379523612559, "learning_rate": 9.562616769015918e-07, "loss": -0.0259, "num_tokens": 16677291.0, "reward": 2.2916667461395264, "reward_std": 0.6784341335296631, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000685453414917, "sampling/importance_sampling_ratio/min": 0.5649370551109314, "sampling/sampling_logp_difference/max": 1.0466623306274414, "sampling/sampling_logp_difference/mean": 0.008202552795410156, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3779.0, "completions/max_terminated_length": 3779.0, "completions/mean_length": 1750.666748046875, "completions/mean_terminated_length": 1750.666748046875, "completions/min_length": 535.0, "completions/min_terminated_length": 535.0, "entropy": 0.3380280211567879, "epoch": 0.1353904282115869, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10171451541512459, "kl": 0.0028700066031888127, "learning_rate": 9.558561908879717e-07, "loss": 0.0054, "num_tokens": 16728107.0, "reward": 2.875, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3795042037963867, "sampling/importance_sampling_ratio/mean": 1.0000109672546387, "sampling/importance_sampling_ratio/min": 0.61398845911026, "sampling/sampling_logp_difference/max": 0.4877791404724121, "sampling/sampling_logp_difference/mean": 0.00832158513367176, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4611.0, "completions/max_terminated_length": 4611.0, "completions/mean_length": 2244.541748046875, "completions/mean_terminated_length": 2244.541748046875, "completions/min_length": 543.0, "completions/min_terminated_length": 543.0, "entropy": 0.4141591340303421, "epoch": 0.13602015113350127, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0970031075765478, "kl": 0.0026419550995342433, "learning_rate": 9.554489207444795e-07, "loss": -0.1032, "num_tokens": 16795528.0, "reward": 1.875, "reward_std": 0.48464712500572205, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.777106523513794, "sampling/importance_sampling_ratio/mean": 0.9998747706413269, "sampling/importance_sampling_ratio/min": 0.2823575735092163, "sampling/sampling_logp_difference/max": 1.2645809650421143, "sampling/sampling_logp_difference/mean": 0.010411503724753857, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3327.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 2236.95849609375, "completions/mean_terminated_length": 2236.95849609375, "completions/min_length": 1366.0, "completions/min_terminated_length": 1366.0, "entropy": 0.33826161175966263, "epoch": 0.13664987405541562, "frac_reward_zero_std": 0.0, "grad_norm": 0.10748671003017937, "kl": 0.0025175250484608114, "learning_rate": 9.55039868065089e-07, "loss": 0.003, "num_tokens": 16869359.0, "reward": 2.5, "reward_std": 0.5748276710510254, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7181551456451416, "sampling/importance_sampling_ratio/mean": 1.0001200437545776, "sampling/importance_sampling_ratio/min": 0.6113288402557373, "sampling/sampling_logp_difference/max": 0.5412511825561523, "sampling/sampling_logp_difference/mean": 0.008706027641892433, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5025.0, "completions/max_terminated_length": 5025.0, "completions/mean_length": 3083.45849609375, "completions/mean_terminated_length": 3083.45849609375, "completions/min_length": 1081.0, "completions/min_terminated_length": 1081.0, "entropy": 0.7055185586214066, "epoch": 0.13727959697732997, "frac_reward_zero_std": 0.0, "grad_norm": 0.11631475703637083, "kl": 0.0026918044313788414, "learning_rate": 9.5462903445075e-07, "loss": -0.0974, "num_tokens": 16952410.0, "reward": 2.125, "reward_std": 0.45032867789268494, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6682480573654175, "sampling/importance_sampling_ratio/mean": 1.0001956224441528, "sampling/importance_sampling_ratio/min": 0.6731621026992798, "sampling/sampling_logp_difference/max": 0.5117740035057068, "sampling/sampling_logp_difference/mean": 0.015241038054227829, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7196.0, "completions/max_terminated_length": 7196.0, "completions/mean_length": 2821.75, "completions/mean_terminated_length": 2821.75, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "entropy": 0.3447025790810585, "epoch": 0.13790931989924432, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08982778493638695, "kl": 0.0023329863906838, "learning_rate": 9.542164215093837e-07, "loss": -0.1082, "num_tokens": 17036564.0, "reward": 2.4166667461395264, "reward_std": 0.487678587436676, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5754797458648682, "sampling/importance_sampling_ratio/mean": 0.9999448657035828, "sampling/importance_sampling_ratio/min": 0.49057433009147644, "sampling/sampling_logp_difference/max": 0.7121784687042236, "sampling/sampling_logp_difference/mean": 0.008757824078202248, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7786.0, "completions/max_terminated_length": 7786.0, "completions/mean_length": 2410.70849609375, "completions/mean_terminated_length": 2410.70849609375, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "entropy": 0.3741969019174576, "epoch": 0.1385390428211587, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11590841770395033, "kl": 0.0027456604584585875, "learning_rate": 9.538020308558743e-07, "loss": -0.0986, "num_tokens": 17106445.0, "reward": 2.4166667461395264, "reward_std": 0.36585909128189087, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6755340099334717, "sampling/importance_sampling_ratio/mean": 0.9999172687530518, "sampling/importance_sampling_ratio/min": 0.6453542709350586, "sampling/sampling_logp_difference/max": 0.5161318778991699, "sampling/sampling_logp_difference/mean": 0.009333131834864616, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7391.0, "completions/mean_length": 3539.791748046875, "completions/mean_terminated_length": 3337.521728515625, "completions/min_length": 1195.0, "completions/min_terminated_length": 1195.0, "entropy": 0.4390391334891319, "epoch": 0.13916876574307305, "frac_reward_zero_std": 0.0, "grad_norm": 0.1052950418552125, "kl": 0.0021466654434334487, "learning_rate": 9.533858641120638e-07, "loss": 0.082, "num_tokens": 17203536.0, "reward": 2.25, "reward_std": 0.7013811469078064, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5610169172286987, "sampling/importance_sampling_ratio/mean": 1.0000765323638916, "sampling/importance_sampling_ratio/min": 0.5527652502059937, "sampling/sampling_logp_difference/max": 0.5928218364715576, "sampling/sampling_logp_difference/mean": 0.01117000263184309, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6783.0, "completions/max_terminated_length": 6783.0, "completions/mean_length": 2975.83349609375, "completions/mean_terminated_length": 2975.83349609375, "completions/min_length": 1059.0, "completions/min_terminated_length": 1059.0, "entropy": 0.4318165108561516, "epoch": 0.1397984886649874, "frac_reward_zero_std": 0.0, "grad_norm": 0.09026561328968491, "kl": 0.0021844521688763052, "learning_rate": 9.529679229067455e-07, "loss": -0.0821, "num_tokens": 17296828.0, "reward": 2.4166667461395264, "reward_std": 0.4629100561141968, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6883540153503418, "sampling/importance_sampling_ratio/mean": 1.0000004768371582, "sampling/importance_sampling_ratio/min": 0.6915233731269836, "sampling/sampling_logp_difference/max": 0.5237541198730469, "sampling/sampling_logp_difference/mean": 0.010845146141946316, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7979.0, "completions/max_terminated_length": 7979.0, "completions/mean_length": 3055.375, "completions/mean_terminated_length": 3055.375, "completions/min_length": 525.0, "completions/min_terminated_length": 525.0, "entropy": 0.45554961264133453, "epoch": 0.14042821158690177, "frac_reward_zero_std": 0.0, "grad_norm": 0.18226256510947011, "kl": 0.002869397692847997, "learning_rate": 9.525482088756577e-07, "loss": 0.14, "num_tokens": 17382773.0, "reward": 2.25, "reward_std": 0.7108919024467468, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.52273690700531, "sampling/importance_sampling_ratio/mean": 1.0001264810562134, "sampling/importance_sampling_ratio/min": 0.41880062222480774, "sampling/sampling_logp_difference/max": 0.8703603744506836, "sampling/sampling_logp_difference/mean": 0.011398951523005962, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6101.0, "completions/max_terminated_length": 6101.0, "completions/mean_length": 2492.25, "completions/mean_terminated_length": 2492.25, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "entropy": 0.38182948529720306, "epoch": 0.14105793450881612, "frac_reward_zero_std": 0.0, "grad_norm": 0.09624019990654605, "kl": 0.002552328980527818, "learning_rate": 9.521267236614772e-07, "loss": -0.0591, "num_tokens": 17454443.0, "reward": 2.6666667461395264, "reward_std": 0.46854168176651, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5595643520355225, "sampling/importance_sampling_ratio/mean": 1.0001375675201416, "sampling/importance_sampling_ratio/min": 0.7003143429756165, "sampling/sampling_logp_difference/max": 0.44440650939941406, "sampling/sampling_logp_difference/mean": 0.009750105440616608, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5984.0, "completions/max_terminated_length": 5984.0, "completions/mean_length": 3652.08349609375, "completions/mean_terminated_length": 3652.08349609375, "completions/min_length": 1853.0, "completions/min_terminated_length": 1853.0, "entropy": 0.4114760532975197, "epoch": 0.14168765743073047, "frac_reward_zero_std": 0.0, "grad_norm": 0.09613590082252033, "kl": 0.002309023402631283, "learning_rate": 9.517034689138124e-07, "loss": -0.0596, "num_tokens": 17569077.0, "reward": 2.0833334922790527, "reward_std": 0.5970091223716736, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999053478240967, "sampling/importance_sampling_ratio/min": 0.5093384385108948, "sampling/sampling_logp_difference/max": 0.8538808822631836, "sampling/sampling_logp_difference/mean": 0.010691076517105103, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5738.0, "completions/max_terminated_length": 5738.0, "completions/mean_length": 2616.70849609375, "completions/mean_terminated_length": 2616.70849609375, "completions/min_length": 1241.0, "completions/min_terminated_length": 1241.0, "entropy": 0.24342621490359306, "epoch": 0.14231738035264482, "frac_reward_zero_std": 0.0, "grad_norm": 0.19991859939631415, "kl": 0.002302657288964838, "learning_rate": 9.512784462891979e-07, "loss": -0.0718, "num_tokens": 17652294.0, "reward": 2.7083334922790527, "reward_std": 0.48112308979034424, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.786931037902832, "sampling/importance_sampling_ratio/mean": 0.9998667240142822, "sampling/importance_sampling_ratio/min": 0.36214014887809753, "sampling/sampling_logp_difference/max": 1.0157239437103271, "sampling/sampling_logp_difference/mean": 0.006984020583331585, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7701.0, "completions/max_terminated_length": 7701.0, "completions/mean_length": 2300.666748046875, "completions/mean_terminated_length": 2300.666748046875, "completions/min_length": 1014.0, "completions/min_terminated_length": 1014.0, "entropy": 0.34226129949092865, "epoch": 0.1429471032745592, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09507319285975808, "kl": 0.0025999860954470932, "learning_rate": 9.508516574510872e-07, "loss": 0.0595, "num_tokens": 17722366.0, "reward": 2.6666667461395264, "reward_std": 0.49601587653160095, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4418771266937256, "sampling/importance_sampling_ratio/mean": 0.9997963905334473, "sampling/importance_sampling_ratio/min": 0.6931360363960266, "sampling/sampling_logp_difference/max": 0.3665289878845215, "sampling/sampling_logp_difference/mean": 0.009274821728467941, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6794.0, "completions/max_terminated_length": 6794.0, "completions/mean_length": 2841.875, "completions/mean_terminated_length": 2841.875, "completions/min_length": 1245.0, "completions/min_terminated_length": 1245.0, "entropy": 0.4558505564928055, "epoch": 0.14357682619647355, "frac_reward_zero_std": 0.0, "grad_norm": 0.10226759082148924, "kl": 0.0024618423776701093, "learning_rate": 9.504231040698461e-07, "loss": -0.1121, "num_tokens": 17800507.0, "reward": 2.625, "reward_std": 0.6542876362800598, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4329988956451416, "sampling/importance_sampling_ratio/mean": 1.0000125169754028, "sampling/importance_sampling_ratio/min": 0.6960609555244446, "sampling/sampling_logp_difference/max": 0.3623180389404297, "sampling/sampling_logp_difference/mean": 0.011089938692748547, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4439.0, "completions/max_terminated_length": 4439.0, "completions/mean_length": 2660.95849609375, "completions/mean_terminated_length": 2660.95849609375, "completions/min_length": 799.0, "completions/min_terminated_length": 799.0, "entropy": 0.4012398347258568, "epoch": 0.1442065491183879, "frac_reward_zero_std": 0.0, "grad_norm": 0.09573855635180067, "kl": 0.002832598111126572, "learning_rate": 9.499927878227471e-07, "loss": 0.007, "num_tokens": 17877322.0, "reward": 2.1666667461395264, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4488199949264526, "sampling/importance_sampling_ratio/mean": 0.9998981356620789, "sampling/importance_sampling_ratio/min": 0.33303239941596985, "sampling/sampling_logp_difference/max": 1.099515438079834, "sampling/sampling_logp_difference/mean": 0.009662531316280365, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6047.0, "completions/mean_length": 4009.20849609375, "completions/mean_terminated_length": 3827.347900390625, "completions/min_length": 1717.0, "completions/min_terminated_length": 1717.0, "entropy": 0.3697197288274765, "epoch": 0.14483627204030228, "frac_reward_zero_std": 0.0, "grad_norm": 0.10820180528429069, "kl": 0.002096261945553124, "learning_rate": 9.495607103939616e-07, "loss": 0.136, "num_tokens": 18015559.0, "reward": 1.9583333730697632, "reward_std": 0.8042440414428711, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6891673803329468, "sampling/importance_sampling_ratio/mean": 1.0000245571136475, "sampling/importance_sampling_ratio/min": 0.5823068022727966, "sampling/sampling_logp_difference/max": 0.5407578945159912, "sampling/sampling_logp_difference/mean": 0.009945766068994999, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5474.0, "completions/max_terminated_length": 5474.0, "completions/mean_length": 2991.45849609375, "completions/mean_terminated_length": 2991.45849609375, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.44228701293468475, "epoch": 0.14546599496221663, "frac_reward_zero_std": 0.0, "grad_norm": 0.09206324615124369, "kl": 0.002392296737525612, "learning_rate": 9.491268734745544e-07, "loss": -0.0346, "num_tokens": 18112306.0, "reward": 2.0833334922790527, "reward_std": 0.6288648843765259, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.818986415863037, "sampling/importance_sampling_ratio/mean": 1.0000826120376587, "sampling/importance_sampling_ratio/min": 0.5582740306854248, "sampling/sampling_logp_difference/max": 0.5982794761657715, "sampling/sampling_logp_difference/mean": 0.01082410104572773, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5757.0, "completions/max_terminated_length": 5757.0, "completions/mean_length": 2995.75, "completions/mean_terminated_length": 2995.75, "completions/min_length": 1141.0, "completions/min_terminated_length": 1141.0, "entropy": 0.426316998898983, "epoch": 0.14609571788413098, "frac_reward_zero_std": 0.0, "grad_norm": 0.10516847246260645, "kl": 0.0025464942445978522, "learning_rate": 9.486912787624759e-07, "loss": -0.0381, "num_tokens": 18201084.0, "reward": 2.2916667461395264, "reward_std": 0.6389504671096802, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.923194169998169, "sampling/importance_sampling_ratio/mean": 0.9999963641166687, "sampling/importance_sampling_ratio/min": 0.5688611268997192, "sampling/sampling_logp_difference/max": 0.6539874076843262, "sampling/sampling_logp_difference/mean": 0.010380967520177364, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6601.0, "completions/max_terminated_length": 6601.0, "completions/mean_length": 2646.33349609375, "completions/mean_terminated_length": 2646.33349609375, "completions/min_length": 911.0, "completions/min_terminated_length": 911.0, "entropy": 0.44922342151403427, "epoch": 0.14672544080604533, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12629900080072248, "kl": 0.0024384339340031147, "learning_rate": 9.482539279625571e-07, "loss": -0.1362, "num_tokens": 18275372.0, "reward": 2.125, "reward_std": 0.3698274493217468, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.598039984703064, "sampling/importance_sampling_ratio/mean": 0.9999645352363586, "sampling/importance_sampling_ratio/min": 0.4792057275772095, "sampling/sampling_logp_difference/max": 0.7356252670288086, "sampling/sampling_logp_difference/mean": 0.01090584509074688, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3739.0, "completions/max_terminated_length": 3739.0, "completions/mean_length": 2422.416748046875, "completions/mean_terminated_length": 2422.416748046875, "completions/min_length": 848.0, "completions/min_terminated_length": 848.0, "entropy": 0.2836337983608246, "epoch": 0.1473551637279597, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08445650827931839, "kl": 0.002577103383373469, "learning_rate": 9.478148227865013e-07, "loss": -0.0664, "num_tokens": 18378494.0, "reward": 2.3333334922790527, "reward_std": 0.30860671401023865, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.818173885345459, "sampling/importance_sampling_ratio/mean": 1.0000070333480835, "sampling/importance_sampling_ratio/min": 0.332999050617218, "sampling/sampling_logp_difference/max": 1.0996155738830566, "sampling/sampling_logp_difference/mean": 0.007572642527520657, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4706.0, "completions/max_terminated_length": 4706.0, "completions/mean_length": 2173.33349609375, "completions/mean_terminated_length": 2173.33349609375, "completions/min_length": 700.0, "completions/min_terminated_length": 700.0, "entropy": 0.33542416244745255, "epoch": 0.14798488664987405, "frac_reward_zero_std": 0.0, "grad_norm": 0.10347688254459607, "kl": 0.002762621035799384, "learning_rate": 9.47373964952878e-07, "loss": -0.0818, "num_tokens": 18453470.0, "reward": 2.5, "reward_std": 0.5201624631881714, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.555846095085144, "sampling/importance_sampling_ratio/mean": 1.000167727470398, "sampling/importance_sampling_ratio/min": 0.25776469707489014, "sampling/sampling_logp_difference/max": 1.355708122253418, "sampling/sampling_logp_difference/mean": 0.009015226736664772, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4305.0, "completions/max_terminated_length": 4305.0, "completions/mean_length": 2045.041748046875, "completions/mean_terminated_length": 2045.041748046875, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "entropy": 0.2739229388535023, "epoch": 0.1486146095717884, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08683702570662988, "kl": 0.0026264566113241017, "learning_rate": 9.469313561871169e-07, "loss": -0.0474, "num_tokens": 18513687.0, "reward": 2.8333334922790527, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5836118459701538, "sampling/importance_sampling_ratio/mean": 0.9998219609260559, "sampling/importance_sampling_ratio/min": 0.2062067836523056, "sampling/sampling_logp_difference/max": 1.5788757801055908, "sampling/sampling_logp_difference/mean": 0.007396630011498928, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4789.0, "completions/max_terminated_length": 4789.0, "completions/mean_length": 1674.791748046875, "completions/mean_terminated_length": 1674.791748046875, "completions/min_length": 694.0, "completions/min_terminated_length": 694.0, "entropy": 0.3460068553686142, "epoch": 0.14924433249370278, "frac_reward_zero_std": 0.0, "grad_norm": 0.10077903449146093, "kl": 0.003294173104222864, "learning_rate": 9.464869982215e-07, "loss": -0.1073, "num_tokens": 18567186.0, "reward": 2.7916667461395264, "reward_std": 0.42645785212516785, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.695945382118225, "sampling/importance_sampling_ratio/mean": 0.9999930262565613, "sampling/importance_sampling_ratio/min": 0.6966350674629211, "sampling/sampling_logp_difference/max": 0.5282403230667114, "sampling/sampling_logp_difference/mean": 0.008786879479885101, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6532.0, "completions/max_terminated_length": 6532.0, "completions/mean_length": 2699.70849609375, "completions/mean_terminated_length": 2699.70849609375, "completions/min_length": 508.0, "completions/min_terminated_length": 508.0, "entropy": 0.33864038437604904, "epoch": 0.14987405541561713, "frac_reward_zero_std": 0.0, "grad_norm": 0.08771368518123851, "kl": 0.0025880744215101004, "learning_rate": 9.460408927951551e-07, "loss": -0.0864, "num_tokens": 18648251.0, "reward": 2.0416667461395264, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6639413833618164, "sampling/importance_sampling_ratio/mean": 0.9999977946281433, "sampling/importance_sampling_ratio/min": 0.5239130258560181, "sampling/sampling_logp_difference/max": 0.6464295387268066, "sampling/sampling_logp_difference/mean": 0.008928337134420872, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2867.0, "completions/max_terminated_length": 2867.0, "completions/mean_length": 1667.75, "completions/mean_terminated_length": 1667.75, "completions/min_length": 826.0, "completions/min_terminated_length": 826.0, "entropy": 0.2847708761692047, "epoch": 0.15050377833753148, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06719380019532657, "kl": 0.002988588239531964, "learning_rate": 9.4559304165405e-07, "loss": 0.0083, "num_tokens": 18704189.0, "reward": 2.9583334922790527, "reward_std": 0.1178511306643486, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999975860118866, "sampling/importance_sampling_ratio/min": 0.6198402047157288, "sampling/sampling_logp_difference/max": 0.7764995098114014, "sampling/sampling_logp_difference/mean": 0.007405862212181091, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7440.0, "completions/mean_length": 3590.20849609375, "completions/mean_terminated_length": 2932.8095703125, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "entropy": 0.44532521814107895, "epoch": 0.15113350125944586, "frac_reward_zero_std": 0.0, "grad_norm": 0.09598628816193934, "kl": 0.0022392612299881876, "learning_rate": 9.451434465509841e-07, "loss": 0.0391, "num_tokens": 18813602.0, "reward": 2.0, "reward_std": 0.7617236375808716, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000163197517395, "sampling/importance_sampling_ratio/min": 0.49639931321144104, "sampling/sampling_logp_difference/max": 1.0540542602539062, "sampling/sampling_logp_difference/mean": 0.01088710781186819, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5287.0, "completions/mean_length": 3263.75, "completions/mean_terminated_length": 3049.478271484375, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "entropy": 0.4177509844303131, "epoch": 0.1517632241813602, "frac_reward_zero_std": 0.0, "grad_norm": 0.08073312882994532, "kl": 0.0028395375120453537, "learning_rate": 9.446921092455826e-07, "loss": -0.0427, "num_tokens": 18905388.0, "reward": 2.0, "reward_std": 0.6279458999633789, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.730062484741211, "sampling/importance_sampling_ratio/mean": 1.0002187490463257, "sampling/importance_sampling_ratio/min": 0.5186716914176941, "sampling/sampling_logp_difference/max": 0.6564841270446777, "sampling/sampling_logp_difference/mean": 0.010219919495284557, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5663.0, "completions/max_terminated_length": 5663.0, "completions/mean_length": 1972.291748046875, "completions/mean_terminated_length": 1972.291748046875, "completions/min_length": 702.0, "completions/min_terminated_length": 702.0, "entropy": 0.37152474373579025, "epoch": 0.15239294710327456, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11933887261649241, "kl": 0.003332946973387152, "learning_rate": 9.442390315042897e-07, "loss": -0.0866, "num_tokens": 18963355.0, "reward": 2.375, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6368534564971924, "sampling/importance_sampling_ratio/mean": 0.9998989701271057, "sampling/importance_sampling_ratio/min": 0.6149726510047913, "sampling/sampling_logp_difference/max": 0.49277573823928833, "sampling/sampling_logp_difference/mean": 0.009664828889071941, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3592.0, "completions/max_terminated_length": 3592.0, "completions/mean_length": 1389.2083740234375, "completions/mean_terminated_length": 1389.2083740234375, "completions/min_length": 502.0, "completions/min_terminated_length": 502.0, "entropy": 0.28401051461696625, "epoch": 0.1530226700251889, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.04977703949277074, "kl": 0.003453766054008156, "learning_rate": 9.437842151003607e-07, "loss": -0.022, "num_tokens": 19005400.0, "reward": 2.9583334922790527, "reward_std": 0.1178511306643486, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2625361680984497, "sampling/importance_sampling_ratio/mean": 1.0001364946365356, "sampling/importance_sampling_ratio/min": 0.6978968381881714, "sampling/sampling_logp_difference/max": 0.3596839904785156, "sampling/sampling_logp_difference/mean": 0.007302968762814999, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8005.0, "completions/max_terminated_length": 8005.0, "completions/mean_length": 2986.541748046875, "completions/mean_terminated_length": 2986.541748046875, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "entropy": 0.49820248782634735, "epoch": 0.15365239294710328, "frac_reward_zero_std": 0.0, "grad_norm": 0.12137239416720787, "kl": 0.003312404209282249, "learning_rate": 9.433276618138559e-07, "loss": -0.0106, "num_tokens": 19085757.0, "reward": 2.0416667461395264, "reward_std": 0.6628212928771973, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6089375019073486, "sampling/importance_sampling_ratio/mean": 0.9998688697814941, "sampling/importance_sampling_ratio/min": 0.39611220359802246, "sampling/sampling_logp_difference/max": 0.9260578155517578, "sampling/sampling_logp_difference/mean": 0.012059131637215614, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7840.0, "completions/max_terminated_length": 7840.0, "completions/mean_length": 2865.95849609375, "completions/mean_terminated_length": 2865.95849609375, "completions/min_length": 621.0, "completions/min_terminated_length": 621.0, "entropy": 0.4804942458868027, "epoch": 0.15428211586901763, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12411159766026461, "kl": 0.0036256437888368964, "learning_rate": 9.428693734316337e-07, "loss": 0.0201, "num_tokens": 19165004.0, "reward": 2.375, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999273419380188, "sampling/importance_sampling_ratio/min": 0.38544604182243347, "sampling/sampling_logp_difference/max": 0.9533540606498718, "sampling/sampling_logp_difference/mean": 0.012176094576716423, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5146.0, "completions/max_terminated_length": 5146.0, "completions/mean_length": 2601.95849609375, "completions/mean_terminated_length": 2601.95849609375, "completions/min_length": 1060.0, "completions/min_terminated_length": 1060.0, "entropy": 0.3966900035738945, "epoch": 0.15491183879093198, "frac_reward_zero_std": 0.0, "grad_norm": 0.1120995221535474, "kl": 0.0027765120612457395, "learning_rate": 9.424093517473428e-07, "loss": 0.0055, "num_tokens": 19247203.0, "reward": 2.5833334922790527, "reward_std": 0.5440332293510437, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7771852016448975, "sampling/importance_sampling_ratio/mean": 0.9999101758003235, "sampling/importance_sampling_ratio/min": 0.452383816242218, "sampling/sampling_logp_difference/max": 0.7932243347167969, "sampling/sampling_logp_difference/mean": 0.010172965936362743, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6731.0, "completions/mean_length": 4655.2919921875, "completions/mean_terminated_length": 4333.77294921875, "completions/min_length": 1652.0, "completions/min_terminated_length": 1652.0, "entropy": 0.5175693854689598, "epoch": 0.15554156171284636, "frac_reward_zero_std": 0.0, "grad_norm": 0.10170041658232039, "kl": 0.002494252461474389, "learning_rate": 9.419475985614163e-07, "loss": 0.205, "num_tokens": 19372058.0, "reward": 2.125, "reward_std": 0.9141364097595215, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999351501464844, "sampling/importance_sampling_ratio/min": 0.42936116456985474, "sampling/sampling_logp_difference/max": 0.8454568386077881, "sampling/sampling_logp_difference/mean": 0.01316086109727621, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3475.0, "completions/max_terminated_length": 3475.0, "completions/mean_length": 1999.666748046875, "completions/mean_terminated_length": 1999.666748046875, "completions/min_length": 53.0, "completions/min_terminated_length": 53.0, "entropy": 0.3259703814983368, "epoch": 0.1561712846347607, "frac_reward_zero_std": 0.0, "grad_norm": 0.13500928321114722, "kl": 0.003436160390265286, "learning_rate": 9.414841156810634e-07, "loss": -0.1086, "num_tokens": 19447658.0, "reward": 2.125, "reward_std": 0.7532514333724976, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5793201923370361, "sampling/importance_sampling_ratio/mean": 1.0000940561294556, "sampling/importance_sampling_ratio/min": 0.645008385181427, "sampling/sampling_logp_difference/max": 0.45699453353881836, "sampling/sampling_logp_difference/mean": 0.0074465638026595116, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4152.0, "completions/mean_length": 2429.70849609375, "completions/mean_terminated_length": 2179.174072265625, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.40007439255714417, "epoch": 0.15680100755667506, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.13963717231595627, "kl": 0.0028646300197578967, "learning_rate": 9.410189049202634e-07, "loss": 0.2758, "num_tokens": 19513843.0, "reward": 2.5416667461395264, "reward_std": 0.5699716806411743, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.462408185005188, "sampling/importance_sampling_ratio/mean": 1.0001999139785767, "sampling/importance_sampling_ratio/min": 0.5823997855186462, "sampling/sampling_logp_difference/max": 0.5405981540679932, "sampling/sampling_logp_difference/mean": 0.010421785525977612, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4131.0, "completions/max_terminated_length": 4131.0, "completions/mean_length": 2142.5, "completions/mean_terminated_length": 2142.5, "completions/min_length": 1097.0, "completions/min_terminated_length": 1097.0, "entropy": 0.34867677092552185, "epoch": 0.1574307304785894, "frac_reward_zero_std": 0.0, "grad_norm": 0.09612032415075975, "kl": 0.0030607349472120404, "learning_rate": 9.405519680997584e-07, "loss": -0.0629, "num_tokens": 19591695.0, "reward": 2.375, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4351998567581177, "sampling/importance_sampling_ratio/mean": 1.000169038772583, "sampling/importance_sampling_ratio/min": 0.6114693880081177, "sampling/sampling_logp_difference/max": 0.49189043045043945, "sampling/sampling_logp_difference/mean": 0.008358979597687721, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5602.0, "completions/max_terminated_length": 5602.0, "completions/mean_length": 3384.125, "completions/mean_terminated_length": 3384.125, "completions/min_length": 1031.0, "completions/min_terminated_length": 1031.0, "entropy": 0.47110363095998764, "epoch": 0.1580604534005038, "frac_reward_zero_std": 0.0, "grad_norm": 0.09700325769254953, "kl": 0.0028567067347466946, "learning_rate": 9.400833070470453e-07, "loss": 0.0309, "num_tokens": 19687978.0, "reward": 2.375, "reward_std": 0.654287576675415, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5002213716506958, "sampling/importance_sampling_ratio/mean": 0.9999005794525146, "sampling/importance_sampling_ratio/min": 0.43091249465942383, "sampling/sampling_logp_difference/max": 0.8418502807617188, "sampling/sampling_logp_difference/mean": 0.01155785284936428, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5215.0, "completions/max_terminated_length": 5215.0, "completions/mean_length": 2727.125, "completions/mean_terminated_length": 2727.125, "completions/min_length": 1011.0, "completions/min_terminated_length": 1011.0, "entropy": 0.33107326552271843, "epoch": 0.15869017632241814, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09202052703747501, "kl": 0.0027409950853325427, "learning_rate": 9.396129235963698e-07, "loss": 0.0157, "num_tokens": 19770933.0, "reward": 2.4583334922790527, "reward_std": 0.3698274493217468, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.99983811378479, "sampling/importance_sampling_ratio/min": 0.43655925989151, "sampling/sampling_logp_difference/max": 1.7793374061584473, "sampling/sampling_logp_difference/mean": 0.00862819142639637, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5623.0, "completions/mean_length": 3158.20849609375, "completions/mean_terminated_length": 2939.347900390625, "completions/min_length": 1240.0, "completions/min_terminated_length": 1240.0, "entropy": 0.44888950884342194, "epoch": 0.15931989924433249, "frac_reward_zero_std": 0.0, "grad_norm": 0.10321465883507569, "kl": 0.0028029277455061674, "learning_rate": 9.391408195887185e-07, "loss": 0.1557, "num_tokens": 19859610.0, "reward": 2.4166667461395264, "reward_std": 0.6900655627250671, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000486373901367, "sampling/importance_sampling_ratio/min": 0.6142109632492065, "sampling/sampling_logp_difference/max": 0.9069666862487793, "sampling/sampling_logp_difference/mean": 0.011366995051503181, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7580.0, "completions/max_terminated_length": 7580.0, "completions/mean_length": 2969.791748046875, "completions/mean_terminated_length": 2969.791748046875, "completions/min_length": 1142.0, "completions/min_terminated_length": 1142.0, "entropy": 0.3928290829062462, "epoch": 0.15994962216624686, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0913116692787671, "kl": 0.002966663218103349, "learning_rate": 9.386669968718118e-07, "loss": -0.0014, "num_tokens": 19941949.0, "reward": 2.2916667461395264, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.536027431488037, "sampling/importance_sampling_ratio/mean": 1.000032901763916, "sampling/importance_sampling_ratio/min": 0.5673006176948547, "sampling/sampling_logp_difference/max": 0.5668659210205078, "sampling/sampling_logp_difference/mean": 0.010226388461887836, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3047.0, "completions/max_terminated_length": 3047.0, "completions/mean_length": 1756.9583740234375, "completions/mean_terminated_length": 1756.9583740234375, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "entropy": 0.428557813167572, "epoch": 0.1605793450881612, "frac_reward_zero_std": 0.0, "grad_norm": 0.11948030070349544, "kl": 0.003438943065702915, "learning_rate": 9.381914573000973e-07, "loss": 0.0227, "num_tokens": 19996940.0, "reward": 2.375, "reward_std": 0.5586560964584351, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4659514427185059, "sampling/importance_sampling_ratio/mean": 0.9999725222587585, "sampling/importance_sampling_ratio/min": 0.5744006633758545, "sampling/sampling_logp_difference/max": 0.554428219795227, "sampling/sampling_logp_difference/mean": 0.00954129733145237, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4227.0, "completions/max_terminated_length": 4227.0, "completions/mean_length": 2039.4583740234375, "completions/mean_terminated_length": 2039.4583740234375, "completions/min_length": 784.0, "completions/min_terminated_length": 784.0, "entropy": 0.3637930303812027, "epoch": 0.16120906801007556, "frac_reward_zero_std": 0.0, "grad_norm": 0.10198979568804324, "kl": 0.0029985428554937243, "learning_rate": 9.377142027347414e-07, "loss": 0.0715, "num_tokens": 20058943.0, "reward": 2.5833334922790527, "reward_std": 0.4857778251171112, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6254311800003052, "sampling/importance_sampling_ratio/mean": 1.0002200603485107, "sampling/importance_sampling_ratio/min": 0.6766709685325623, "sampling/sampling_logp_difference/max": 0.48577308654785156, "sampling/sampling_logp_difference/mean": 0.00864022970199585, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6332.0, "completions/max_terminated_length": 6332.0, "completions/mean_length": 2891.45849609375, "completions/mean_terminated_length": 2891.45849609375, "completions/min_length": 837.0, "completions/min_terminated_length": 837.0, "entropy": 0.46749477833509445, "epoch": 0.1618387909319899, "frac_reward_zero_std": 0.0, "grad_norm": 0.10235455863906687, "kl": 0.0030903922161087394, "learning_rate": 9.37235235043623e-07, "loss": -0.0193, "num_tokens": 20141514.0, "reward": 2.25, "reward_std": 0.6317276954650879, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5364636182785034, "sampling/importance_sampling_ratio/mean": 1.000015377998352, "sampling/importance_sampling_ratio/min": 0.5344516634941101, "sampling/sampling_logp_difference/max": 0.6265139579772949, "sampling/sampling_logp_difference/mean": 0.011215797625482082, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5017.0, "completions/max_terminated_length": 5017.0, "completions/mean_length": 2539.041748046875, "completions/mean_terminated_length": 2539.041748046875, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "entropy": 0.35795557498931885, "epoch": 0.1624685138539043, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0924502445700443, "kl": 0.0033256360911764205, "learning_rate": 9.367545561013259e-07, "loss": -0.0519, "num_tokens": 20216411.0, "reward": 2.5416667461395264, "reward_std": 0.3917974829673767, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8524569272994995, "sampling/importance_sampling_ratio/mean": 1.0002847909927368, "sampling/importance_sampling_ratio/min": 0.6738995313644409, "sampling/sampling_logp_difference/max": 0.6165127754211426, "sampling/sampling_logp_difference/mean": 0.009134840220212936, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5896.0, "completions/max_terminated_length": 5896.0, "completions/mean_length": 3044.45849609375, "completions/mean_terminated_length": 3044.45849609375, "completions/min_length": 1417.0, "completions/min_terminated_length": 1417.0, "entropy": 0.5457055270671844, "epoch": 0.16309823677581864, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07936960265413011, "kl": 0.0031650869641453028, "learning_rate": 9.362721677891311e-07, "loss": 0.0346, "num_tokens": 20299366.0, "reward": 2.5833334922790527, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7547621726989746, "sampling/importance_sampling_ratio/mean": 0.9999653697013855, "sampling/importance_sampling_ratio/min": 0.22173139452934265, "sampling/sampling_logp_difference/max": 1.5062885284423828, "sampling/sampling_logp_difference/mean": 0.012953594326972961, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6639.0, "completions/mean_length": 3617.95849609375, "completions/mean_terminated_length": 3419.0869140625, "completions/min_length": 1261.0, "completions/min_terminated_length": 1261.0, "entropy": 0.5397490039467812, "epoch": 0.163727959697733, "frac_reward_zero_std": 0.0, "grad_norm": 0.08871232201471732, "kl": 0.0027636014856398106, "learning_rate": 9.3578807199501e-07, "loss": 0.1241, "num_tokens": 20395013.0, "reward": 2.3333334922790527, "reward_std": 0.6279458999633789, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000078558921814, "sampling/importance_sampling_ratio/min": 0.49666059017181396, "sampling/sampling_logp_difference/max": 0.8091678619384766, "sampling/sampling_logp_difference/mean": 0.012550398707389832, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3439.0, "completions/max_terminated_length": 3439.0, "completions/mean_length": 1906.666748046875, "completions/mean_terminated_length": 1906.666748046875, "completions/min_length": 834.0, "completions/min_terminated_length": 834.0, "entropy": 0.3929217532277107, "epoch": 0.16435768261964737, "frac_reward_zero_std": 0.0, "grad_norm": 0.11390703641537293, "kl": 0.0031025047064758837, "learning_rate": 9.353022706136164e-07, "loss": -0.0269, "num_tokens": 20451221.0, "reward": 2.625, "reward_std": 0.5049939155578613, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3425003290176392, "sampling/importance_sampling_ratio/mean": 1.0001126527786255, "sampling/importance_sampling_ratio/min": 0.7197207808494568, "sampling/sampling_logp_difference/max": 0.3288919925689697, "sampling/sampling_logp_difference/mean": 0.009577779099345207, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4982.0, "completions/max_terminated_length": 4982.0, "completions/mean_length": 3057.916748046875, "completions/mean_terminated_length": 3057.916748046875, "completions/min_length": 1888.0, "completions/min_terminated_length": 1888.0, "entropy": 0.3250752240419388, "epoch": 0.16498740554156172, "frac_reward_zero_std": 0.0, "grad_norm": 0.08846288500352936, "kl": 0.002468836843036115, "learning_rate": 9.348147655462799e-07, "loss": 0.0173, "num_tokens": 20546795.0, "reward": 2.0416667461395264, "reward_std": 0.695380687713623, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.8077483177185059, "sampling/importance_sampling_ratio/mean": 0.9998690485954285, "sampling/importance_sampling_ratio/min": 0.5745305418968201, "sampling/sampling_logp_difference/max": 0.5920820236206055, "sampling/sampling_logp_difference/mean": 0.008503550663590431, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6161.0, "completions/max_terminated_length": 6161.0, "completions/mean_length": 1960.7083740234375, "completions/mean_terminated_length": 1960.7083740234375, "completions/min_length": 750.0, "completions/min_terminated_length": 750.0, "entropy": 0.40827810019254684, "epoch": 0.16561712846347607, "frac_reward_zero_std": 0.0, "grad_norm": 0.11684702033016887, "kl": 0.004038955201394856, "learning_rate": 9.343255587009976e-07, "loss": 0.0399, "num_tokens": 20602412.0, "reward": 2.6666667461395264, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8102514743804932, "sampling/importance_sampling_ratio/mean": 0.999999463558197, "sampling/importance_sampling_ratio/min": 0.6718628406524658, "sampling/sampling_logp_difference/max": 0.5934658050537109, "sampling/sampling_logp_difference/mean": 0.009945794008672237, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6285.0, "completions/max_terminated_length": 6285.0, "completions/mean_length": 2936.375, "completions/mean_terminated_length": 2936.375, "completions/min_length": 752.0, "completions/min_terminated_length": 752.0, "entropy": 0.5458346679806709, "epoch": 0.16624685138539042, "frac_reward_zero_std": 0.0, "grad_norm": 0.10004616353456552, "kl": 0.003502722945995629, "learning_rate": 9.338346519924273e-07, "loss": -0.0317, "num_tokens": 20680557.0, "reward": 1.9583333730697632, "reward_std": 0.6380135416984558, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6509571075439453, "sampling/importance_sampling_ratio/mean": 0.9999284148216248, "sampling/importance_sampling_ratio/min": 0.5134274959564209, "sampling/sampling_logp_difference/max": 0.6666464805603027, "sampling/sampling_logp_difference/mean": 0.012908710166811943, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6688.0, "completions/max_terminated_length": 6688.0, "completions/mean_length": 2543.125, "completions/mean_terminated_length": 2543.125, "completions/min_length": 1234.0, "completions/min_terminated_length": 1234.0, "entropy": 0.4177079424262047, "epoch": 0.1668765743073048, "frac_reward_zero_std": 0.0, "grad_norm": 0.16574250169718913, "kl": 0.0032986286096274853, "learning_rate": 9.333420473418794e-07, "loss": 0.1181, "num_tokens": 20755664.0, "reward": 2.375, "reward_std": 0.7013019323348999, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.424229383468628, "sampling/importance_sampling_ratio/mean": 0.9998857975006104, "sampling/importance_sampling_ratio/min": 0.4236172139644623, "sampling/sampling_logp_difference/max": 0.8589251041412354, "sampling/sampling_logp_difference/mean": 0.010224140249192715, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5324.0, "completions/max_terminated_length": 5324.0, "completions/mean_length": 3112.916748046875, "completions/mean_terminated_length": 3112.916748046875, "completions/min_length": 69.0, "completions/min_terminated_length": 69.0, "entropy": 0.3268076404929161, "epoch": 0.16750629722921914, "frac_reward_zero_std": 0.0, "grad_norm": 0.06526766154170584, "kl": 0.002921302104368806, "learning_rate": 9.3284774667731e-07, "loss": -0.1093, "num_tokens": 20853814.0, "reward": 2.7916667461395264, "reward_std": 0.5078567266464233, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.557209849357605, "sampling/importance_sampling_ratio/mean": 0.999931812286377, "sampling/importance_sampling_ratio/min": 0.625821053981781, "sampling/sampling_logp_difference/max": 0.4686908721923828, "sampling/sampling_logp_difference/mean": 0.009094130247831345, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5185.0, "completions/mean_length": 2309.875, "completions/mean_terminated_length": 2054.13037109375, "completions/min_length": 598.0, "completions/min_terminated_length": 598.0, "entropy": 0.36087487637996674, "epoch": 0.1681360201511335, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.119158053405904, "kl": 0.003130676515866071, "learning_rate": 9.323517519333128e-07, "loss": 0.1463, "num_tokens": 20923059.0, "reward": 2.25, "reward_std": 0.38613972067832947, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5364519357681274, "sampling/importance_sampling_ratio/mean": 1.0001944303512573, "sampling/importance_sampling_ratio/min": 0.5832831263542175, "sampling/sampling_logp_difference/max": 0.5390825271606445, "sampling/sampling_logp_difference/mean": 0.009321833029389381, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7102.0, "completions/max_terminated_length": 7102.0, "completions/mean_length": 3401.83349609375, "completions/mean_terminated_length": 3401.83349609375, "completions/min_length": 1654.0, "completions/min_terminated_length": 1654.0, "entropy": 0.43557919561862946, "epoch": 0.16876574307304787, "frac_reward_zero_std": 0.0, "grad_norm": 0.0794356090800084, "kl": 0.002846263174433261, "learning_rate": 9.318540650511122e-07, "loss": -0.0384, "num_tokens": 21023943.0, "reward": 2.625, "reward_std": 0.5383754968643188, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001208782196045, "sampling/importance_sampling_ratio/min": 0.5447903871536255, "sampling/sampling_logp_difference/max": 1.5233945846557617, "sampling/sampling_logp_difference/mean": 0.01136515848338604, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3908.0, "completions/max_terminated_length": 3908.0, "completions/mean_length": 1666.3333740234375, "completions/mean_terminated_length": 1666.3333740234375, "completions/min_length": 679.0, "completions/min_terminated_length": 679.0, "entropy": 0.28372134268283844, "epoch": 0.16939546599496222, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11474083098355478, "kl": 0.003859751857817173, "learning_rate": 9.313546879785549e-07, "loss": 0.0038, "num_tokens": 21082631.0, "reward": 2.5416667461395264, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5871385335922241, "sampling/importance_sampling_ratio/mean": 1.000064730644226, "sampling/importance_sampling_ratio/min": 0.6811763644218445, "sampling/sampling_logp_difference/max": 0.4619326591491699, "sampling/sampling_logp_difference/mean": 0.007679393049329519, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6903.0, "completions/max_terminated_length": 6903.0, "completions/mean_length": 3196.125, "completions/mean_terminated_length": 3196.125, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "entropy": 0.4496636241674423, "epoch": 0.17002518891687657, "frac_reward_zero_std": 0.0, "grad_norm": 0.09440559766121373, "kl": 0.0031007807701826096, "learning_rate": 9.308536226701027e-07, "loss": -0.0412, "num_tokens": 21171314.0, "reward": 2.4583334922790527, "reward_std": 0.5787960886955261, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999005198478699, "sampling/importance_sampling_ratio/min": 0.4840778112411499, "sampling/sampling_logp_difference/max": 0.7793235778808594, "sampling/sampling_logp_difference/mean": 0.011576938442885876, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8082.0, "completions/max_terminated_length": 8082.0, "completions/mean_length": 3699.291748046875, "completions/mean_terminated_length": 3699.291748046875, "completions/min_length": 1982.0, "completions/min_terminated_length": 1982.0, "entropy": 0.39879296720027924, "epoch": 0.17065491183879095, "frac_reward_zero_std": 0.0, "grad_norm": 0.08693780362441085, "kl": 0.00273020809981972, "learning_rate": 9.30350871086825e-07, "loss": 0.0669, "num_tokens": 21289993.0, "reward": 1.9166667461395264, "reward_std": 0.4993361234664917, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000799894332886, "sampling/importance_sampling_ratio/min": 0.4807453453540802, "sampling/sampling_logp_difference/max": 0.7605910301208496, "sampling/sampling_logp_difference/mean": 0.010686988942325115, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5526.0, "completions/max_terminated_length": 5526.0, "completions/mean_length": 2618.416748046875, "completions/mean_terminated_length": 2618.416748046875, "completions/min_length": 1253.0, "completions/min_terminated_length": 1253.0, "entropy": 0.3829045444726944, "epoch": 0.1712846347607053, "frac_reward_zero_std": 0.0, "grad_norm": 0.08943441705591408, "kl": 0.003176437341608107, "learning_rate": 9.298464351963907e-07, "loss": 0.0435, "num_tokens": 21362763.0, "reward": 2.4166667461395264, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6432198286056519, "sampling/importance_sampling_ratio/mean": 1.0000616312026978, "sampling/importance_sampling_ratio/min": 0.2950500249862671, "sampling/sampling_logp_difference/max": 1.2206103801727295, "sampling/sampling_logp_difference/mean": 0.00941160786896944, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4321.0, "completions/max_terminated_length": 4321.0, "completions/mean_length": 2352.125, "completions/mean_terminated_length": 2352.125, "completions/min_length": 1209.0, "completions/min_terminated_length": 1209.0, "entropy": 0.23308371752500534, "epoch": 0.17191435768261965, "frac_reward_zero_std": 0.0, "grad_norm": 0.10877629818444497, "kl": 0.0027265629614703357, "learning_rate": 9.293403169730611e-07, "loss": 0.0023, "num_tokens": 21443286.0, "reward": 2.6666667461395264, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000231266021729, "sampling/importance_sampling_ratio/min": 0.43289482593536377, "sampling/sampling_logp_difference/max": 1.4493303298950195, "sampling/sampling_logp_difference/mean": 0.006542433053255081, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3491.0, "completions/max_terminated_length": 3491.0, "completions/mean_length": 1579.0833740234375, "completions/mean_terminated_length": 1579.0833740234375, "completions/min_length": 605.0, "completions/min_terminated_length": 605.0, "entropy": 0.3352260887622833, "epoch": 0.172544080604534, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07282464597465801, "kl": 0.0034898685989901423, "learning_rate": 9.288325183976817e-07, "loss": -0.1371, "num_tokens": 21493720.0, "reward": 2.875, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.456268548965454, "sampling/importance_sampling_ratio/mean": 0.9998555779457092, "sampling/importance_sampling_ratio/min": 0.7068564891815186, "sampling/sampling_logp_difference/max": 0.37587738037109375, "sampling/sampling_logp_difference/mean": 0.008809930644929409, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7821.0, "completions/max_terminated_length": 7821.0, "completions/mean_length": 4531.5419921875, "completions/mean_terminated_length": 4531.5419921875, "completions/min_length": 2035.0, "completions/min_terminated_length": 2035.0, "entropy": 0.5095522776246071, "epoch": 0.17317380352644837, "frac_reward_zero_std": 0.0, "grad_norm": 0.09226091664130444, "kl": 0.002865921414922923, "learning_rate": 9.28323041457674e-07, "loss": 0.1251, "num_tokens": 21613589.0, "reward": 2.2083334922790527, "reward_std": 0.6274997591972351, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4809354543685913, "sampling/importance_sampling_ratio/mean": 0.9999087452888489, "sampling/importance_sampling_ratio/min": 0.4812960624694824, "sampling/sampling_logp_difference/max": 0.7312726974487305, "sampling/sampling_logp_difference/mean": 0.012314742431044579, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6911.0, "completions/mean_length": 3951.33349609375, "completions/mean_terminated_length": 3766.95654296875, "completions/min_length": 1713.0, "completions/min_terminated_length": 1713.0, "entropy": 0.4718901813030243, "epoch": 0.17380352644836272, "frac_reward_zero_std": 0.0, "grad_norm": 0.10035555758744591, "kl": 0.002926699467934668, "learning_rate": 9.278118881470291e-07, "loss": 0.026, "num_tokens": 21723973.0, "reward": 2.0833334922790527, "reward_std": 0.8410486578941345, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002729892730713, "sampling/importance_sampling_ratio/min": 0.050194595009088516, "sampling/sampling_logp_difference/max": 2.9918479919433594, "sampling/sampling_logp_difference/mean": 0.011778600513935089, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3368.0, "completions/max_terminated_length": 3368.0, "completions/mean_length": 1614.041748046875, "completions/mean_terminated_length": 1614.041748046875, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 0.4687598794698715, "epoch": 0.17443324937027707, "frac_reward_zero_std": 0.0, "grad_norm": 0.1224438400884761, "kl": 0.0037626203848049045, "learning_rate": 9.272990604662987e-07, "loss": -0.1635, "num_tokens": 21773926.0, "reward": 2.1666667461395264, "reward_std": 0.46854168176651, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4081443548202515, "sampling/importance_sampling_ratio/mean": 0.9998109936714172, "sampling/importance_sampling_ratio/min": 0.6097069978713989, "sampling/sampling_logp_difference/max": 0.49477672576904297, "sampling/sampling_logp_difference/mean": 0.011080553755164146, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6302.0, "completions/mean_length": 3148.125, "completions/mean_terminated_length": 2928.826171875, "completions/min_length": 1536.0, "completions/min_terminated_length": 1536.0, "entropy": 0.4273615926504135, "epoch": 0.17506297229219145, "frac_reward_zero_std": 0.0, "grad_norm": 0.12469248635522195, "kl": 0.0025447277002967894, "learning_rate": 9.267845604225877e-07, "loss": 0.1937, "num_tokens": 21871833.0, "reward": 2.5416667461395264, "reward_std": 0.580761194229126, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999575018882751, "sampling/importance_sampling_ratio/min": 0.643458366394043, "sampling/sampling_logp_difference/max": 0.7489335536956787, "sampling/sampling_logp_difference/mean": 0.010901985689997673, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6699.0, "completions/max_terminated_length": 6699.0, "completions/mean_length": 3447.08349609375, "completions/mean_terminated_length": 3447.08349609375, "completions/min_length": 1626.0, "completions/min_terminated_length": 1626.0, "entropy": 0.3680751249194145, "epoch": 0.1756926952141058, "frac_reward_zero_std": 0.0, "grad_norm": 0.08541669822873525, "kl": 0.0025638877414166927, "learning_rate": 9.262683900295462e-07, "loss": -0.0809, "num_tokens": 21970355.0, "reward": 2.375, "reward_std": 0.6618843674659729, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000791549682617, "sampling/importance_sampling_ratio/min": 0.3148638904094696, "sampling/sampling_logp_difference/max": 1.1556148529052734, "sampling/sampling_logp_difference/mean": 0.009112433530390263, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7323.0, "completions/max_terminated_length": 7323.0, "completions/mean_length": 2829.125, "completions/mean_terminated_length": 2829.125, "completions/min_length": 1312.0, "completions/min_terminated_length": 1312.0, "entropy": 0.3200450763106346, "epoch": 0.17632241813602015, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08715773219221286, "kl": 0.002875084464903921, "learning_rate": 9.257505513073618e-07, "loss": -0.0649, "num_tokens": 22062014.0, "reward": 2.375, "reward_std": 0.4244926869869232, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4548461437225342, "sampling/importance_sampling_ratio/mean": 0.9998822212219238, "sampling/importance_sampling_ratio/min": 0.5218978524208069, "sampling/sampling_logp_difference/max": 0.6502833366394043, "sampling/sampling_logp_difference/mean": 0.008168349973857403, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5663.0, "completions/mean_length": 2541.875, "completions/mean_terminated_length": 2296.217529296875, "completions/min_length": 658.0, "completions/min_terminated_length": 658.0, "entropy": 0.42669615894556046, "epoch": 0.1769521410579345, "frac_reward_zero_std": 0.0, "grad_norm": 0.13479273129916133, "kl": 0.0032771449768915772, "learning_rate": 9.252310462827515e-07, "loss": 0.1224, "num_tokens": 22132699.0, "reward": 2.5833334922790527, "reward_std": 0.647485077381134, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5926529169082642, "sampling/importance_sampling_ratio/mean": 0.9998801350593567, "sampling/importance_sampling_ratio/min": 0.7137778997421265, "sampling/sampling_logp_difference/max": 0.46540117263793945, "sampling/sampling_logp_difference/mean": 0.01025929395109415, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5292.0, "completions/max_terminated_length": 5292.0, "completions/mean_length": 2819.58349609375, "completions/mean_terminated_length": 2819.58349609375, "completions/min_length": 1138.0, "completions/min_terminated_length": 1138.0, "entropy": 0.2868489772081375, "epoch": 0.17758186397984888, "frac_reward_zero_std": 0.0, "grad_norm": 0.15800509552345807, "kl": 0.002760153671260923, "learning_rate": 9.247098769889541e-07, "loss": 0.0567, "num_tokens": 22238897.0, "reward": 2.3333334922790527, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999650120735168, "sampling/importance_sampling_ratio/min": 0.5870799422264099, "sampling/sampling_logp_difference/max": 0.9122864007949829, "sampling/sampling_logp_difference/mean": 0.00720578245818615, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3685.0, "completions/max_terminated_length": 3685.0, "completions/mean_length": 1631.041748046875, "completions/mean_terminated_length": 1631.041748046875, "completions/min_length": 900.0, "completions/min_terminated_length": 900.0, "entropy": 0.3201381489634514, "epoch": 0.17821158690176322, "frac_reward_zero_std": 0.0, "grad_norm": 0.11622848963435234, "kl": 0.0034998247283510864, "learning_rate": 9.241870454657219e-07, "loss": 0.0363, "num_tokens": 22293186.0, "reward": 2.75, "reward_std": 0.5260697603225708, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.2979313135147095, "sampling/importance_sampling_ratio/mean": 1.0001293420791626, "sampling/importance_sampling_ratio/min": 0.6955874562263489, "sampling/sampling_logp_difference/max": 0.36299848556518555, "sampling/sampling_logp_difference/mean": 0.008024930953979492, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4608.0, "completions/max_terminated_length": 4608.0, "completions/mean_length": 2537.08349609375, "completions/mean_terminated_length": 2537.08349609375, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.3054070584475994, "epoch": 0.17884130982367757, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07566172289557985, "kl": 0.0032833294826559722, "learning_rate": 9.236625537593127e-07, "loss": 0.0073, "num_tokens": 22374004.0, "reward": 2.7916667461395264, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7256911993026733, "sampling/importance_sampling_ratio/mean": 1.000190258026123, "sampling/importance_sampling_ratio/min": 0.3245081305503845, "sampling/sampling_logp_difference/max": 1.1254446506500244, "sampling/sampling_logp_difference/mean": 0.007746071554720402, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5421.0, "completions/max_terminated_length": 5421.0, "completions/mean_length": 1887.2083740234375, "completions/mean_terminated_length": 1887.2083740234375, "completions/min_length": 643.0, "completions/min_terminated_length": 643.0, "entropy": 0.4271187409758568, "epoch": 0.17947103274559195, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06872733630324084, "kl": 0.003923923242837191, "learning_rate": 9.23136403922482e-07, "loss": -0.0284, "num_tokens": 22426665.0, "reward": 2.5, "reward_std": 0.17817416787147522, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4565168619155884, "sampling/importance_sampling_ratio/mean": 0.9999055862426758, "sampling/importance_sampling_ratio/min": 0.6346858143806458, "sampling/sampling_logp_difference/max": 0.45462512969970703, "sampling/sampling_logp_difference/mean": 0.010930594056844711, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 7695.0, "completions/mean_length": 4491.08349609375, "completions/mean_terminated_length": 3750.900146484375, "completions/min_length": 782.0, "completions/min_terminated_length": 782.0, "entropy": 0.42760948836803436, "epoch": 0.1801007556675063, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09371200694573789, "kl": 0.0027672399301081896, "learning_rate": 9.226085980144752e-07, "loss": 0.1413, "num_tokens": 22552659.0, "reward": 2.0833334922790527, "reward_std": 0.7494649887084961, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.840079665184021, "sampling/importance_sampling_ratio/mean": 1.0001249313354492, "sampling/importance_sampling_ratio/min": 0.4832182824611664, "sampling/sampling_logp_difference/max": 0.7272868156433105, "sampling/sampling_logp_difference/mean": 0.01075330562889576, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5101.0, "completions/max_terminated_length": 5101.0, "completions/mean_length": 2154.20849609375, "completions/mean_terminated_length": 2154.20849609375, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "entropy": 0.3223307505249977, "epoch": 0.18073047858942065, "frac_reward_zero_std": 0.0, "grad_norm": 0.12749370962754633, "kl": 0.003108099161181599, "learning_rate": 9.220791381010187e-07, "loss": 0.1444, "num_tokens": 22616144.0, "reward": 2.5416667461395264, "reward_std": 0.46288391947746277, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000211000442505, "sampling/importance_sampling_ratio/min": 0.6554322242736816, "sampling/sampling_logp_difference/max": 0.8339405059814453, "sampling/sampling_logp_difference/mean": 0.008420777507126331, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3002.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 1811.5, "completions/mean_terminated_length": 1811.5, "completions/min_length": 774.0, "completions/min_terminated_length": 774.0, "entropy": 0.413737528026104, "epoch": 0.181360201511335, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09165078552061248, "kl": 0.0035568050807341933, "learning_rate": 9.215480262543127e-07, "loss": 0.0226, "num_tokens": 22670076.0, "reward": 2.2083334922790527, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5156501531600952, "sampling/importance_sampling_ratio/mean": 0.9998666644096375, "sampling/importance_sampling_ratio/min": 0.7826712131500244, "sampling/sampling_logp_difference/max": 0.4158444404602051, "sampling/sampling_logp_difference/mean": 0.009811358526349068, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6616.0, "completions/mean_length": 2464.75, "completions/mean_terminated_length": 2215.7392578125, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "entropy": 0.3121287673711777, "epoch": 0.18198992443324938, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10689598960513334, "kl": 0.003319262934383005, "learning_rate": 9.210152645530227e-07, "loss": 0.1485, "num_tokens": 22741414.0, "reward": 2.6666667461395264, "reward_std": 0.4714045226573944, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4201207160949707, "sampling/importance_sampling_ratio/mean": 1.0000017881393433, "sampling/importance_sampling_ratio/min": 0.05320177227258682, "sampling/sampling_logp_difference/max": 2.9336636066436768, "sampling/sampling_logp_difference/mean": 0.007882725447416306, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5620.0, "completions/max_terminated_length": 5620.0, "completions/mean_length": 2380.08349609375, "completions/mean_terminated_length": 2380.08349609375, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "entropy": 0.4331054165959358, "epoch": 0.18261964735516373, "frac_reward_zero_std": 0.0, "grad_norm": 0.1241786622710996, "kl": 0.0035523761762306094, "learning_rate": 9.204808550822712e-07, "loss": 0.0813, "num_tokens": 22807696.0, "reward": 2.375, "reward_std": 0.45032864809036255, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6689224243164062, "sampling/importance_sampling_ratio/mean": 1.0000721216201782, "sampling/importance_sampling_ratio/min": 0.4978328347206116, "sampling/sampling_logp_difference/max": 0.697490930557251, "sampling/sampling_logp_difference/mean": 0.0105275958776474, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4746.0, "completions/max_terminated_length": 4746.0, "completions/mean_length": 2919.25, "completions/mean_terminated_length": 2919.25, "completions/min_length": 1356.0, "completions/min_terminated_length": 1356.0, "entropy": 0.4804266393184662, "epoch": 0.18324937027707808, "frac_reward_zero_std": 0.0, "grad_norm": 0.0976170440675803, "kl": 0.0035426911199465394, "learning_rate": 9.1994479993363e-07, "loss": 0.0036, "num_tokens": 22891734.0, "reward": 2.0833334922790527, "reward_std": 0.7318329811096191, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000008225440979, "sampling/importance_sampling_ratio/min": 0.20166508853435516, "sampling/sampling_logp_difference/max": 1.601146936416626, "sampling/sampling_logp_difference/mean": 0.011691058985888958, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4749.0, "completions/max_terminated_length": 4749.0, "completions/mean_length": 2370.875, "completions/mean_terminated_length": 2370.875, "completions/min_length": 689.0, "completions/min_terminated_length": 689.0, "entropy": 0.3229430094361305, "epoch": 0.18387909319899245, "frac_reward_zero_std": 0.0, "grad_norm": 0.0950059103981524, "kl": 0.002895945857744664, "learning_rate": 9.194071012051115e-07, "loss": 0.0247, "num_tokens": 22962235.0, "reward": 2.4166667461395264, "reward_std": 0.4629100561141968, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434515476227, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5654182434082031, "sampling/importance_sampling_ratio/mean": 0.9999330043792725, "sampling/importance_sampling_ratio/min": 0.6447262167930603, "sampling/sampling_logp_difference/max": 0.448153018951416, "sampling/sampling_logp_difference/mean": 0.0078104035928845406, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6740.0, "completions/max_terminated_length": 6740.0, "completions/mean_length": 3401.875, "completions/mean_terminated_length": 3401.875, "completions/min_length": 818.0, "completions/min_terminated_length": 818.0, "entropy": 0.3300101235508919, "epoch": 0.1845088161209068, "frac_reward_zero_std": 0.0, "grad_norm": 0.07871895335984737, "kl": 0.003027097962331027, "learning_rate": 9.18867761001161e-07, "loss": -0.0276, "num_tokens": 23079576.0, "reward": 2.4583334922790527, "reward_std": 0.6440334320068359, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001723766326904, "sampling/importance_sampling_ratio/min": 0.6172886490821838, "sampling/sampling_logp_difference/max": 1.2421376705169678, "sampling/sampling_logp_difference/mean": 0.008515974506735802, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4752.0, "completions/max_terminated_length": 4752.0, "completions/mean_length": 2267.08349609375, "completions/mean_terminated_length": 2267.08349609375, "completions/min_length": 297.0, "completions/min_terminated_length": 297.0, "entropy": 0.4011530578136444, "epoch": 0.18513853904282115, "frac_reward_zero_std": 0.0, "grad_norm": 0.09664121481492696, "kl": 0.0037327150930650532, "learning_rate": 9.183267814326481e-07, "loss": -0.104, "num_tokens": 23149826.0, "reward": 2.4583334922790527, "reward_std": 0.759833037853241, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7242425680160522, "sampling/importance_sampling_ratio/mean": 1.0000940561294556, "sampling/importance_sampling_ratio/min": 0.4756200611591339, "sampling/sampling_logp_difference/max": 0.743135929107666, "sampling/sampling_logp_difference/mean": 0.00914597138762474, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7479.0, "completions/mean_length": 3503.875, "completions/mean_terminated_length": 3300.04345703125, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "entropy": 0.39085162431001663, "epoch": 0.1857682619647355, "frac_reward_zero_std": 0.0, "grad_norm": 0.09734704068999503, "kl": 0.003101348876953125, "learning_rate": 9.177841646168586e-07, "loss": 0.0493, "num_tokens": 23254823.0, "reward": 2.0833334922790527, "reward_std": 0.862645149230957, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.8333333134651184, "rewards/format_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.899147868156433, "sampling/importance_sampling_ratio/mean": 0.9999153017997742, "sampling/importance_sampling_ratio/min": 0.6039217710494995, "sampling/sampling_logp_difference/max": 0.6414053440093994, "sampling/sampling_logp_difference/mean": 0.009681997820734978, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3234.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 1591.125, "completions/mean_terminated_length": 1591.125, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "entropy": 0.28667084872722626, "epoch": 0.18639798488664988, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08260625635582915, "kl": 0.0032381184864789248, "learning_rate": 9.172399126774859e-07, "loss": 0.0611, "num_tokens": 23310674.0, "reward": 2.625, "reward_std": 0.4261821210384369, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.38890540599823, "sampling/importance_sampling_ratio/mean": 1.0000485181808472, "sampling/importance_sampling_ratio/min": 0.7378345131874084, "sampling/sampling_logp_difference/max": 0.32851600646972656, "sampling/sampling_logp_difference/mean": 0.007520824670791626, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7674.0, "completions/max_terminated_length": 7674.0, "completions/mean_length": 3923.041748046875, "completions/mean_terminated_length": 3923.041748046875, "completions/min_length": 1423.0, "completions/min_terminated_length": 1423.0, "entropy": 0.4004623666405678, "epoch": 0.18702770780856423, "frac_reward_zero_std": 0.0, "grad_norm": 0.08361818658653178, "kl": 0.002813231374602765, "learning_rate": 9.166940277446233e-07, "loss": -0.0563, "num_tokens": 23431099.0, "reward": 2.3333334922790527, "reward_std": 0.46854168176651, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.549767017364502, "sampling/importance_sampling_ratio/mean": 0.9999144673347473, "sampling/importance_sampling_ratio/min": 0.5119611620903015, "sampling/sampling_logp_difference/max": 0.6695065498352051, "sampling/sampling_logp_difference/mean": 0.0106385238468647, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4447.0, "completions/max_terminated_length": 4447.0, "completions/mean_length": 1875.7083740234375, "completions/mean_terminated_length": 1875.7083740234375, "completions/min_length": 712.0, "completions/min_terminated_length": 712.0, "entropy": 0.4257367178797722, "epoch": 0.18765743073047858, "frac_reward_zero_std": 0.0, "grad_norm": 0.11157352259987176, "kl": 0.003858394338749349, "learning_rate": 9.16146511954755e-07, "loss": -0.0367, "num_tokens": 23484444.0, "reward": 2.375, "reward_std": 0.4493256211280823, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3940476179122925, "sampling/importance_sampling_ratio/mean": 0.9999217987060547, "sampling/importance_sampling_ratio/min": 0.4128086566925049, "sampling/sampling_logp_difference/max": 0.8847711086273193, "sampling/sampling_logp_difference/mean": 0.00985007919371128, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5365.0, "completions/max_terminated_length": 5365.0, "completions/mean_length": 2607.041748046875, "completions/mean_terminated_length": 2607.041748046875, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "entropy": 0.39961014688014984, "epoch": 0.18828715365239296, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07562719306951707, "kl": 0.003413626691326499, "learning_rate": 9.155973674507482e-07, "loss": 0.0255, "num_tokens": 23560733.0, "reward": 2.25, "reward_std": 0.38613972067832947, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6922067403793335, "sampling/importance_sampling_ratio/mean": 0.9999526143074036, "sampling/importance_sampling_ratio/min": 0.49864593148231506, "sampling/sampling_logp_difference/max": 0.6958589553833008, "sampling/sampling_logp_difference/mean": 0.009793180041015148, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4820.0, "completions/max_terminated_length": 4820.0, "completions/mean_length": 2661.541748046875, "completions/mean_terminated_length": 2661.541748046875, "completions/min_length": 928.0, "completions/min_terminated_length": 928.0, "entropy": 0.31844108924269676, "epoch": 0.1889168765743073, "frac_reward_zero_std": 0.0, "grad_norm": 0.09549956592029263, "kl": 0.0033782431855797768, "learning_rate": 9.150465963818446e-07, "loss": 0.0021, "num_tokens": 23654002.0, "reward": 1.875, "reward_std": 0.5787960886955261, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5350971221923828, "sampling/importance_sampling_ratio/mean": 1.0000810623168945, "sampling/importance_sampling_ratio/min": 0.5335642695426941, "sampling/sampling_logp_difference/max": 0.6281757354736328, "sampling/sampling_logp_difference/mean": 0.00826771929860115, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3495.0, "completions/max_terminated_length": 3495.0, "completions/mean_length": 1853.916748046875, "completions/mean_terminated_length": 1853.916748046875, "completions/min_length": 484.0, "completions/min_terminated_length": 484.0, "entropy": 0.30144575983285904, "epoch": 0.18954659949622166, "frac_reward_zero_std": 0.0, "grad_norm": 0.09640236307646347, "kl": 0.0036611839313991368, "learning_rate": 9.14494200903652e-07, "loss": -0.095, "num_tokens": 23716656.0, "reward": 2.5, "reward_std": 0.6218419671058655, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5099762678146362, "sampling/importance_sampling_ratio/mean": 0.9999739527702332, "sampling/importance_sampling_ratio/min": 0.3168613314628601, "sampling/sampling_logp_difference/max": 1.1492910385131836, "sampling/sampling_logp_difference/mean": 0.008284431882202625, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6208.0, "completions/max_terminated_length": 6208.0, "completions/mean_length": 2801.541748046875, "completions/mean_terminated_length": 2801.541748046875, "completions/min_length": 1451.0, "completions/min_terminated_length": 1451.0, "entropy": 0.4772297218441963, "epoch": 0.190176322418136, "frac_reward_zero_std": 0.0, "grad_norm": 0.10262275111668265, "kl": 0.0034289424074813724, "learning_rate": 9.139401831781357e-07, "loss": 0.0673, "num_tokens": 23792757.0, "reward": 2.8333334922790527, "reward_std": 0.3900056481361389, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000841617584229, "sampling/importance_sampling_ratio/min": 0.6853045225143433, "sampling/sampling_logp_difference/max": 0.7137622833251953, "sampling/sampling_logp_difference/mean": 0.011117285117506981, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7103.0, "completions/max_terminated_length": 7103.0, "completions/mean_length": 3144.25, "completions/mean_terminated_length": 3144.25, "completions/min_length": 1436.0, "completions/min_terminated_length": 1436.0, "entropy": 0.36568940430879593, "epoch": 0.19080604534005038, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08459135219628325, "kl": 0.002774488239083439, "learning_rate": 9.1338454537361e-07, "loss": 0.1253, "num_tokens": 23886339.0, "reward": 2.5416667461395264, "reward_std": 0.4082186222076416, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7089818716049194, "sampling/importance_sampling_ratio/mean": 1.0001283884048462, "sampling/importance_sampling_ratio/min": 0.42478179931640625, "sampling/sampling_logp_difference/max": 0.8561797142028809, "sampling/sampling_logp_difference/mean": 0.009492240846157074, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7494.0, "completions/mean_length": 4489.9169921875, "completions/mean_terminated_length": 4328.95654296875, "completions/min_length": 2191.0, "completions/min_terminated_length": 2191.0, "entropy": 0.3855496719479561, "epoch": 0.19143576826196473, "frac_reward_zero_std": 0.0, "grad_norm": 0.08666101603761, "kl": 0.0025701832491904497, "learning_rate": 9.128272896647302e-07, "loss": 0.1236, "num_tokens": 24020633.0, "reward": 2.0416667461395264, "reward_std": 0.701400637626648, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.9876841306686401, "sampling/importance_sampling_ratio/mean": 0.9999614357948303, "sampling/importance_sampling_ratio/min": 0.19338931143283844, "sampling/sampling_logp_difference/max": 1.643049955368042, "sampling/sampling_logp_difference/mean": 0.00983867235481739, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3721.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 2015.875, "completions/mean_terminated_length": 2015.875, "completions/min_length": 663.0, "completions/min_terminated_length": 663.0, "entropy": 0.2953627333045006, "epoch": 0.19206549118387908, "frac_reward_zero_std": 0.0, "grad_norm": 0.09837017571289428, "kl": 0.003606483282055706, "learning_rate": 9.122684182324836e-07, "loss": 0.0143, "num_tokens": 24090870.0, "reward": 2.5416667461395264, "reward_std": 0.6055297255516052, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4640631675720215, "sampling/importance_sampling_ratio/mean": 1.0000022649765015, "sampling/importance_sampling_ratio/min": 0.5543484687805176, "sampling/sampling_logp_difference/max": 0.5899618268013, "sampling/sampling_logp_difference/mean": 0.0077949874103069305, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4966.0, "completions/max_terminated_length": 4966.0, "completions/mean_length": 2086.33349609375, "completions/mean_terminated_length": 2086.33349609375, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "entropy": 0.4584585875272751, "epoch": 0.19269521410579346, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1102314890372206, "kl": 0.0036936001270078123, "learning_rate": 9.117079332641809e-07, "loss": -0.0116, "num_tokens": 24150686.0, "reward": 2.6666667461395264, "reward_std": 0.4993361234664917, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4443130493164062, "sampling/importance_sampling_ratio/mean": 1.0000940561294556, "sampling/importance_sampling_ratio/min": 0.5765435695648193, "sampling/sampling_logp_difference/max": 0.5507043600082397, "sampling/sampling_logp_difference/mean": 0.011159347370266914, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3367.0, "completions/max_terminated_length": 3367.0, "completions/mean_length": 1742.125, "completions/mean_terminated_length": 1742.125, "completions/min_length": 853.0, "completions/min_terminated_length": 853.0, "entropy": 0.44973722100257874, "epoch": 0.1933249370277078, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12151403131256572, "kl": 0.00356053322320804, "learning_rate": 9.111458369534481e-07, "loss": -0.0982, "num_tokens": 24201297.0, "reward": 2.25, "reward_std": 0.38613972067832947, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8690871000289917, "sampling/importance_sampling_ratio/mean": 0.9999852180480957, "sampling/importance_sampling_ratio/min": 0.6428571343421936, "sampling/sampling_logp_difference/max": 0.6254501342773438, "sampling/sampling_logp_difference/mean": 0.010673816315829754, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5983.0, "completions/max_terminated_length": 5983.0, "completions/mean_length": 3287.83349609375, "completions/mean_terminated_length": 3287.83349609375, "completions/min_length": 1680.0, "completions/min_terminated_length": 1680.0, "entropy": 0.4010486602783203, "epoch": 0.19395465994962216, "frac_reward_zero_std": 0.0, "grad_norm": 0.08846920835195979, "kl": 0.0031455042771995068, "learning_rate": 9.105821315002177e-07, "loss": -0.1043, "num_tokens": 24314837.0, "reward": 2.3333334922790527, "reward_std": 0.6279458999633789, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.9289382696151733, "sampling/importance_sampling_ratio/mean": 0.9999759793281555, "sampling/importance_sampling_ratio/min": 0.605370283126831, "sampling/sampling_logp_difference/max": 0.6569697856903076, "sampling/sampling_logp_difference/mean": 0.010035906918346882, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4943.0, "completions/mean_length": 2667.875, "completions/mean_terminated_length": 2427.69580078125, "completions/min_length": 1064.0, "completions/min_terminated_length": 1064.0, "entropy": 0.4103725254535675, "epoch": 0.19458438287153654, "frac_reward_zero_std": 0.0, "grad_norm": 0.10496877420653687, "kl": 0.0027877037064172328, "learning_rate": 9.100168191107202e-07, "loss": 0.1973, "num_tokens": 24393858.0, "reward": 2.6666667461395264, "reward_std": 0.6649550199508667, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4363757371902466, "sampling/importance_sampling_ratio/mean": 0.999821662902832, "sampling/importance_sampling_ratio/min": 0.5311832427978516, "sampling/sampling_logp_difference/max": 0.632648229598999, "sampling/sampling_logp_difference/mean": 0.009314789436757565, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6307.0, "completions/mean_length": 3549.20849609375, "completions/mean_terminated_length": 3347.347900390625, "completions/min_length": 1378.0, "completions/min_terminated_length": 1378.0, "entropy": 0.42878131568431854, "epoch": 0.1952141057934509, "frac_reward_zero_std": 0.0, "grad_norm": 0.10121537504275414, "kl": 0.00298281415598467, "learning_rate": 9.09449901997475e-07, "loss": -0.099, "num_tokens": 24499383.0, "reward": 1.5833333730697632, "reward_std": 0.7378528118133545, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5925335884094238, "sampling/importance_sampling_ratio/mean": 0.9997551441192627, "sampling/importance_sampling_ratio/min": 0.17034737765789032, "sampling/sampling_logp_difference/max": 1.7699155807495117, "sampling/sampling_logp_difference/mean": 0.011338068172335625, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4878.0, "completions/mean_length": 2515.416748046875, "completions/mean_terminated_length": 2268.608642578125, "completions/min_length": 1324.0, "completions/min_terminated_length": 1324.0, "entropy": 0.5371106341481209, "epoch": 0.19584382871536524, "frac_reward_zero_std": 0.0, "grad_norm": 0.2849474332002813, "kl": 0.003162346896715462, "learning_rate": 9.08881382379282e-07, "loss": 0.025, "num_tokens": 24574361.0, "reward": 1.7083333730697632, "reward_std": 0.5930407047271729, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999680519104004, "sampling/importance_sampling_ratio/min": 0.4462907612323761, "sampling/sampling_logp_difference/max": 1.3376893997192383, "sampling/sampling_logp_difference/mean": 0.012335274368524551, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3820.0, "completions/max_terminated_length": 3820.0, "completions/mean_length": 2094.5, "completions/mean_terminated_length": 2094.5, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "entropy": 0.29288557916879654, "epoch": 0.1964735516372796, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09081032611415435, "kl": 0.003629433107562363, "learning_rate": 9.083112624812132e-07, "loss": -0.0262, "num_tokens": 24643157.0, "reward": 2.5, "reward_std": 0.30860671401023865, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001567602157593, "sampling/importance_sampling_ratio/min": 0.6167654991149902, "sampling/sampling_logp_difference/max": 0.8067870140075684, "sampling/sampling_logp_difference/mean": 0.007551092188805342, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6755.0, "completions/mean_length": 2833.666748046875, "completions/mean_terminated_length": 2600.69580078125, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "entropy": 0.3766615018248558, "epoch": 0.19710327455919396, "frac_reward_zero_std": 0.0, "grad_norm": 0.12798120155902643, "kl": 0.003104441799223423, "learning_rate": 9.07739544534604e-07, "loss": 0.2081, "num_tokens": 24726269.0, "reward": 2.5833334922790527, "reward_std": 0.7042440176010132, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7625888586044312, "sampling/importance_sampling_ratio/mean": 0.9999747276306152, "sampling/importance_sampling_ratio/min": 0.6053944826126099, "sampling/sampling_logp_difference/max": 0.5667836666107178, "sampling/sampling_logp_difference/mean": 0.010279249399900436, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7627.0, "completions/mean_length": 3708.125, "completions/mean_terminated_length": 3513.174072265625, "completions/min_length": 1126.0, "completions/min_terminated_length": 1126.0, "entropy": 0.42774833738803864, "epoch": 0.1977329974811083, "frac_reward_zero_std": 0.0, "grad_norm": 0.11499076739077925, "kl": 0.0032699269359000027, "learning_rate": 9.071662307770438e-07, "loss": -0.013, "num_tokens": 24834936.0, "reward": 2.375, "reward_std": 0.7194125056266785, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000425577163696, "sampling/importance_sampling_ratio/min": 0.43253737688064575, "sampling/sampling_logp_difference/max": 1.7151877880096436, "sampling/sampling_logp_difference/mean": 0.011919780634343624, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7009.0, "completions/max_terminated_length": 7009.0, "completions/mean_length": 3651.83349609375, "completions/mean_terminated_length": 3651.83349609375, "completions/min_length": 1869.0, "completions/min_terminated_length": 1869.0, "entropy": 0.5494667738676071, "epoch": 0.19836272040302266, "frac_reward_zero_std": 0.0, "grad_norm": 0.08966998991999087, "kl": 0.002971353242173791, "learning_rate": 9.065913234523679e-07, "loss": 0.0998, "num_tokens": 24933060.0, "reward": 1.875, "reward_std": 0.7192515730857849, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.608435034751892, "sampling/importance_sampling_ratio/mean": 1.0000885725021362, "sampling/importance_sampling_ratio/min": 0.5762887597084045, "sampling/sampling_logp_difference/max": 0.5511465072631836, "sampling/sampling_logp_difference/mean": 0.01230967789888382, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3020.0, "completions/max_terminated_length": 3020.0, "completions/mean_length": 1950.416748046875, "completions/mean_terminated_length": 1950.416748046875, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "entropy": 0.32315099239349365, "epoch": 0.19899244332493704, "frac_reward_zero_std": 0.0, "grad_norm": 0.16046267815324425, "kl": 0.0034824940375983715, "learning_rate": 9.060148248106484e-07, "loss": -0.0193, "num_tokens": 24997334.0, "reward": 2.5416667461395264, "reward_std": 0.6718525290489197, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4552662372589111, "sampling/importance_sampling_ratio/mean": 1.0001771450042725, "sampling/importance_sampling_ratio/min": 0.6351242065429688, "sampling/sampling_logp_difference/max": 0.4539346694946289, "sampling/sampling_logp_difference/mean": 0.007840659469366074, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7299.0, "completions/max_terminated_length": 7299.0, "completions/mean_length": 1857.666748046875, "completions/mean_terminated_length": 1857.666748046875, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.32995258271694183, "epoch": 0.1996221662468514, "frac_reward_zero_std": 0.0, "grad_norm": 0.10622237086881438, "kl": 0.0032074283226393163, "learning_rate": 9.054367371081857e-07, "loss": 0.0368, "num_tokens": 25059886.0, "reward": 2.2916667461395264, "reward_std": 0.5383754968643188, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.9007008075714111, "sampling/importance_sampling_ratio/mean": 0.9998922348022461, "sampling/importance_sampling_ratio/min": 0.5815222263336182, "sampling/sampling_logp_difference/max": 0.6422226428985596, "sampling/sampling_logp_difference/mean": 0.0087108314037323, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7117.0, "completions/max_terminated_length": 7117.0, "completions/mean_length": 2940.416748046875, "completions/mean_terminated_length": 2940.416748046875, "completions/min_length": 1264.0, "completions/min_terminated_length": 1264.0, "entropy": 0.5021751746535301, "epoch": 0.20025188916876574, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08637150730115198, "kl": 0.00292043830268085, "learning_rate": 9.048570626074993e-07, "loss": -0.0173, "num_tokens": 25138576.0, "reward": 2.0416667461395264, "reward_std": 0.46798479557037354, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6475157737731934, "sampling/importance_sampling_ratio/mean": 0.9999146461486816, "sampling/importance_sampling_ratio/min": 0.3923838436603546, "sampling/sampling_logp_difference/max": 0.9355146884918213, "sampling/sampling_logp_difference/mean": 0.011836141347885132, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5145.0, "completions/max_terminated_length": 5145.0, "completions/mean_length": 2134.125, "completions/mean_terminated_length": 2134.125, "completions/min_length": 640.0, "completions/min_terminated_length": 640.0, "entropy": 0.4135653227567673, "epoch": 0.2008816120906801, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11154031659925531, "kl": 0.0032010178547352552, "learning_rate": 9.042758035773191e-07, "loss": 0.0583, "num_tokens": 25203851.0, "reward": 2.5833334922790527, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3088243007659912, "sampling/importance_sampling_ratio/mean": 0.999991238117218, "sampling/importance_sampling_ratio/min": 0.6304877400398254, "sampling/sampling_logp_difference/max": 0.461261510848999, "sampling/sampling_logp_difference/mean": 0.009313980117440224, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5262.0, "completions/max_terminated_length": 5262.0, "completions/mean_length": 2451.95849609375, "completions/mean_terminated_length": 2451.95849609375, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "entropy": 0.4280291944742203, "epoch": 0.20151133501259447, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10764000677019764, "kl": 0.0032293383846990764, "learning_rate": 9.036929622925768e-07, "loss": -0.0646, "num_tokens": 25271026.0, "reward": 2.7083334922790527, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3770643472671509, "sampling/importance_sampling_ratio/mean": 0.9999249577522278, "sampling/importance_sampling_ratio/min": 0.6946262717247009, "sampling/sampling_logp_difference/max": 0.3643813133239746, "sampling/sampling_logp_difference/mean": 0.010169543325901031, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4512.0, "completions/max_terminated_length": 4512.0, "completions/mean_length": 2703.70849609375, "completions/mean_terminated_length": 2703.70849609375, "completions/min_length": 1238.0, "completions/min_terminated_length": 1238.0, "entropy": 0.33976753056049347, "epoch": 0.20214105793450882, "frac_reward_zero_std": 0.0, "grad_norm": 0.10870942080565277, "kl": 0.0028505646623671055, "learning_rate": 9.031085410343961e-07, "loss": -0.0158, "num_tokens": 25375339.0, "reward": 2.0416667461395264, "reward_std": 0.6274997591972351, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.7751991748809814, "sampling/importance_sampling_ratio/mean": 1.000092625617981, "sampling/importance_sampling_ratio/min": 0.35101318359375, "sampling/sampling_logp_difference/max": 1.046931505203247, "sampling/sampling_logp_difference/mean": 0.008709597401320934, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3213.0, "completions/max_terminated_length": 3213.0, "completions/mean_length": 1646.166748046875, "completions/mean_terminated_length": 1646.166748046875, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.2628306858241558, "epoch": 0.20277078085642317, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12854466750132032, "kl": 0.003480133193079382, "learning_rate": 9.025225420900853e-07, "loss": 0.0572, "num_tokens": 25425767.0, "reward": 2.6666667461395264, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3652182817459106, "sampling/importance_sampling_ratio/mean": 1.0001436471939087, "sampling/importance_sampling_ratio/min": 0.6625393033027649, "sampling/sampling_logp_difference/max": 0.41167545318603516, "sampling/sampling_logp_difference/mean": 0.006953134201467037, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6034.0, "completions/max_terminated_length": 6034.0, "completions/mean_length": 3259.25, "completions/mean_terminated_length": 3259.25, "completions/min_length": 1520.0, "completions/min_terminated_length": 1520.0, "entropy": 0.23829511180520058, "epoch": 0.20340050377833754, "frac_reward_zero_std": 0.0, "grad_norm": 0.06936005043013664, "kl": 0.0025436211144551635, "learning_rate": 9.019349677531265e-07, "loss": -0.1476, "num_tokens": 25524901.0, "reward": 2.125, "reward_std": 0.6440334320068359, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9947572946548462, "sampling/importance_sampling_ratio/mean": 0.9999880790710449, "sampling/importance_sampling_ratio/min": 0.33675017952919006, "sampling/sampling_logp_difference/max": 1.088413953781128, "sampling/sampling_logp_difference/mean": 0.006255451589822769, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4938.0, "completions/max_terminated_length": 4938.0, "completions/mean_length": 2377.291748046875, "completions/mean_terminated_length": 2377.291748046875, "completions/min_length": 1003.0, "completions/min_terminated_length": 1003.0, "entropy": 0.42710109055042267, "epoch": 0.2040302267002519, "frac_reward_zero_std": 0.0, "grad_norm": 0.12213233060808112, "kl": 0.0032583679421804845, "learning_rate": 9.013458203231683e-07, "loss": 0.0032, "num_tokens": 25595180.0, "reward": 2.5833334922790527, "reward_std": 0.6257078647613525, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4619165658950806, "sampling/importance_sampling_ratio/mean": 0.9999761581420898, "sampling/importance_sampling_ratio/min": 0.6585530042648315, "sampling/sampling_logp_difference/max": 0.4177103042602539, "sampling/sampling_logp_difference/mean": 0.010461735539138317, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6189.0, "completions/mean_length": 3508.95849609375, "completions/mean_terminated_length": 3305.347900390625, "completions/min_length": 1008.0, "completions/min_terminated_length": 1008.0, "entropy": 0.3000921383500099, "epoch": 0.20465994962216624, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06986091232845582, "kl": 0.002617401070892811, "learning_rate": 9.007551021060157e-07, "loss": 0.0876, "num_tokens": 25710179.0, "reward": 2.5, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.904389500617981, "sampling/importance_sampling_ratio/mean": 1.0000079870224, "sampling/importance_sampling_ratio/min": 0.4844416379928589, "sampling/sampling_logp_difference/max": 0.7247583270072937, "sampling/sampling_logp_difference/mean": 0.007737878710031509, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7527.0, "completions/mean_length": 3022.75, "completions/mean_terminated_length": 2798.0, "completions/min_length": 1042.0, "completions/min_terminated_length": 1042.0, "entropy": 0.43323662132024765, "epoch": 0.2052896725440806, "frac_reward_zero_std": 0.0, "grad_norm": 0.10759711763977559, "kl": 0.0034750315244309604, "learning_rate": 9.001628154136216e-07, "loss": 0.0578, "num_tokens": 25798029.0, "reward": 2.5416667461395264, "reward_std": 0.6718525290489197, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6655305624008179, "sampling/importance_sampling_ratio/mean": 0.9999606609344482, "sampling/importance_sampling_ratio/min": 0.33892378211021423, "sampling/sampling_logp_difference/max": 1.0819799900054932, "sampling/sampling_logp_difference/mean": 0.010482619516551495, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5160.0, "completions/max_terminated_length": 5160.0, "completions/mean_length": 2153.541748046875, "completions/mean_terminated_length": 2153.541748046875, "completions/min_length": 877.0, "completions/min_terminated_length": 877.0, "entropy": 0.3449120298027992, "epoch": 0.20591939546599497, "frac_reward_zero_std": 0.0, "grad_norm": 0.11997112490904156, "kl": 0.003970728081185371, "learning_rate": 8.995689625640775e-07, "loss": -0.1058, "num_tokens": 25865546.0, "reward": 2.5, "reward_std": 0.6317276954650879, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4431145191192627, "sampling/importance_sampling_ratio/mean": 0.9999964237213135, "sampling/importance_sampling_ratio/min": 0.5672860145568848, "sampling/sampling_logp_difference/max": 0.5668916702270508, "sampling/sampling_logp_difference/mean": 0.009056910872459412, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4517.0, "completions/max_terminated_length": 4517.0, "completions/mean_length": 2536.041748046875, "completions/mean_terminated_length": 2536.041748046875, "completions/min_length": 1168.0, "completions/min_terminated_length": 1168.0, "entropy": 0.4359535574913025, "epoch": 0.20654911838790932, "frac_reward_zero_std": 0.0, "grad_norm": 0.11827367269123859, "kl": 0.0034396096016280353, "learning_rate": 8.989735458816044e-07, "loss": 0.0256, "num_tokens": 25939147.0, "reward": 2.3333334922790527, "reward_std": 0.6560657024383545, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5280604362487793, "sampling/importance_sampling_ratio/mean": 1.000114917755127, "sampling/importance_sampling_ratio/min": 0.6110793352127075, "sampling/sampling_logp_difference/max": 0.49252843856811523, "sampling/sampling_logp_difference/mean": 0.010726595297455788, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4812.0, "completions/max_terminated_length": 4812.0, "completions/mean_length": 1870.2083740234375, "completions/mean_terminated_length": 1870.2083740234375, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "entropy": 0.49909062683582306, "epoch": 0.20717884130982367, "frac_reward_zero_std": 0.0, "grad_norm": 0.1319707907583447, "kl": 0.0033126199850812554, "learning_rate": 8.983765676965445e-07, "loss": 0.1105, "num_tokens": 25991792.0, "reward": 2.375, "reward_std": 0.45032864809036255, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3669416904449463, "sampling/importance_sampling_ratio/mean": 0.9997658729553223, "sampling/importance_sampling_ratio/min": 0.49828967452049255, "sampling/sampling_logp_difference/max": 0.6965737342834473, "sampling/sampling_logp_difference/mean": 0.010696898214519024, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2750.0, "completions/max_terminated_length": 2750.0, "completions/mean_length": 1730.666748046875, "completions/mean_terminated_length": 1730.666748046875, "completions/min_length": 874.0, "completions/min_terminated_length": 874.0, "entropy": 0.2694992944598198, "epoch": 0.20780856423173805, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09361355661375045, "kl": 0.003186512680258602, "learning_rate": 8.977780303453504e-07, "loss": -0.0292, "num_tokens": 26047120.0, "reward": 2.6666667461395264, "reward_std": 0.43015047907829285, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4185734987258911, "sampling/importance_sampling_ratio/mean": 0.9998390078544617, "sampling/importance_sampling_ratio/min": 0.7352702021598816, "sampling/sampling_logp_difference/max": 0.3496518135070801, "sampling/sampling_logp_difference/mean": 0.006786121986806393, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6064.0, "completions/max_terminated_length": 6064.0, "completions/mean_length": 1753.7083740234375, "completions/mean_terminated_length": 1753.7083740234375, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "entropy": 0.3044595569372177, "epoch": 0.2084382871536524, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.09141588441791747, "kl": 0.0037687811418436468, "learning_rate": 8.971779361705776e-07, "loss": 0.091, "num_tokens": 26099929.0, "reward": 2.5416667461395264, "reward_std": 0.17251639068126678, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6759231090545654, "sampling/importance_sampling_ratio/mean": 0.9999001622200012, "sampling/importance_sampling_ratio/min": 0.2509756088256836, "sampling/sampling_logp_difference/max": 1.382399559020996, "sampling/sampling_logp_difference/mean": 0.007749505341053009, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6165.0, "completions/max_terminated_length": 6165.0, "completions/mean_length": 2584.83349609375, "completions/mean_terminated_length": 2584.83349609375, "completions/min_length": 1190.0, "completions/min_terminated_length": 1190.0, "entropy": 0.37204499542713165, "epoch": 0.20906801007556675, "frac_reward_zero_std": 0.0, "grad_norm": 0.08870541365199512, "kl": 0.0032524022390134633, "learning_rate": 8.965762875208746e-07, "loss": -0.0214, "num_tokens": 26183133.0, "reward": 2.3333334922790527, "reward_std": 0.5440332293510437, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6134141683578491, "sampling/importance_sampling_ratio/mean": 1.0000596046447754, "sampling/importance_sampling_ratio/min": 0.4610983431339264, "sampling/sampling_logp_difference/max": 0.7741439342498779, "sampling/sampling_logp_difference/mean": 0.009080990217626095, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4761.0, "completions/max_terminated_length": 4761.0, "completions/mean_length": 2212.08349609375, "completions/mean_terminated_length": 2212.08349609375, "completions/min_length": 698.0, "completions/min_terminated_length": 698.0, "entropy": 0.43791453540325165, "epoch": 0.2096977329974811, "frac_reward_zero_std": 0.0, "grad_norm": 0.11022809886325806, "kl": 0.004287134564947337, "learning_rate": 8.95973086750974e-07, "loss": 0.0552, "num_tokens": 26246575.0, "reward": 2.125, "reward_std": 0.6026668548583984, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4573042392730713, "sampling/importance_sampling_ratio/mean": 0.9999969005584717, "sampling/importance_sampling_ratio/min": 0.6663782000541687, "sampling/sampling_logp_difference/max": 0.405897855758667, "sampling/sampling_logp_difference/mean": 0.010096556507050991, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5721.0, "completions/mean_length": 3227.83349609375, "completions/mean_terminated_length": 3012.0, "completions/min_length": 1554.0, "completions/min_terminated_length": 1554.0, "entropy": 0.4636162593960762, "epoch": 0.21032745591939547, "frac_reward_zero_std": 0.0, "grad_norm": 0.11074607623635467, "kl": 0.0033940066932700574, "learning_rate": 8.953683362216824e-07, "loss": 0.0009, "num_tokens": 26349699.0, "reward": 1.9166667461395264, "reward_std": 0.8604943752288818, "rewards/cloze_reward/mean": 0.25, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999729990959167, "sampling/importance_sampling_ratio/min": 0.40049469470977783, "sampling/sampling_logp_difference/max": 0.9150547981262207, "sampling/sampling_logp_difference/mean": 0.011452188715338707, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2816.0, "completions/max_terminated_length": 2816.0, "completions/mean_length": 1724.375, "completions/mean_terminated_length": 1724.375, "completions/min_length": 566.0, "completions/min_terminated_length": 566.0, "entropy": 0.35427525639533997, "epoch": 0.21095717884130982, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09404336169288613, "kl": 0.0033970167860388756, "learning_rate": 8.947620382998727e-07, "loss": -0.0812, "num_tokens": 26399660.0, "reward": 2.75, "reward_std": 0.36585909128189087, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5782008171081543, "sampling/importance_sampling_ratio/mean": 1.0001260042190552, "sampling/importance_sampling_ratio/min": 0.73020339012146, "sampling/sampling_logp_difference/max": 0.4562854766845703, "sampling/sampling_logp_difference/mean": 0.008860539644956589, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7160.0, "completions/mean_length": 3803.916748046875, "completions/mean_terminated_length": 3613.130615234375, "completions/min_length": 1801.0, "completions/min_terminated_length": 1801.0, "entropy": 0.41124332696199417, "epoch": 0.21158690176322417, "frac_reward_zero_std": 0.0, "grad_norm": 0.09650116630896727, "kl": 0.0028099316405132413, "learning_rate": 8.941541953584734e-07, "loss": 0.0142, "num_tokens": 26509514.0, "reward": 2.2083334922790527, "reward_std": 0.8864989280700684, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000362396240234, "sampling/importance_sampling_ratio/min": 0.5505993366241455, "sampling/sampling_logp_difference/max": 0.8012127876281738, "sampling/sampling_logp_difference/mean": 0.010318456217646599, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5403.0, "completions/max_terminated_length": 5403.0, "completions/mean_length": 2610.33349609375, "completions/mean_terminated_length": 2610.33349609375, "completions/min_length": 1272.0, "completions/min_terminated_length": 1272.0, "entropy": 0.3986734375357628, "epoch": 0.21221662468513855, "frac_reward_zero_std": 0.0, "grad_norm": 0.10453886364923791, "kl": 0.003276212024502456, "learning_rate": 8.935448097764602e-07, "loss": 0.0718, "num_tokens": 26594866.0, "reward": 2.4583334922790527, "reward_std": 0.48112308979034424, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4066886901855469, "sampling/importance_sampling_ratio/mean": 1.0000829696655273, "sampling/importance_sampling_ratio/min": 0.3445505201816559, "sampling/sampling_logp_difference/max": 1.0655145645141602, "sampling/sampling_logp_difference/mean": 0.010076683945953846, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5442.0, "completions/max_terminated_length": 5442.0, "completions/mean_length": 3008.33349609375, "completions/mean_terminated_length": 3008.33349609375, "completions/min_length": 1380.0, "completions/min_terminated_length": 1380.0, "entropy": 0.4196186885237694, "epoch": 0.2128463476070529, "frac_reward_zero_std": 0.0, "grad_norm": 0.09740456081305637, "kl": 0.003040513605810702, "learning_rate": 8.929338839388461e-07, "loss": 0.0747, "num_tokens": 26683418.0, "reward": 2.7083334922790527, "reward_std": 0.48112308979034424, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.553633451461792, "sampling/importance_sampling_ratio/mean": 0.9999425411224365, "sampling/importance_sampling_ratio/min": 0.618931233882904, "sampling/sampling_logp_difference/max": 0.47976112365722656, "sampling/sampling_logp_difference/mean": 0.010076450183987617, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5611.0, "completions/max_terminated_length": 5611.0, "completions/mean_length": 2128.166748046875, "completions/mean_terminated_length": 2128.166748046875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "entropy": 0.3355030119419098, "epoch": 0.21347607052896725, "frac_reward_zero_std": 0.0, "grad_norm": 0.105605530828255, "kl": 0.0034716714872047305, "learning_rate": 8.923214202366727e-07, "loss": 0.0422, "num_tokens": 26744710.0, "reward": 2.5, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4783025979995728, "sampling/importance_sampling_ratio/mean": 1.0000542402267456, "sampling/importance_sampling_ratio/min": 0.6371110081672668, "sampling/sampling_logp_difference/max": 0.45081138610839844, "sampling/sampling_logp_difference/mean": 0.008548988029360771, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5827.0, "completions/max_terminated_length": 5827.0, "completions/mean_length": 2226.75, "completions/mean_terminated_length": 2226.75, "completions/min_length": 850.0, "completions/min_terminated_length": 850.0, "entropy": 0.3264845088124275, "epoch": 0.2141057934508816, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07789866762308183, "kl": 0.003281456301920116, "learning_rate": 8.917074210670001e-07, "loss": 0.0299, "num_tokens": 26807448.0, "reward": 2.5, "reward_std": 0.17817416787147522, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.628319263458252, "sampling/importance_sampling_ratio/mean": 0.999915361404419, "sampling/importance_sampling_ratio/min": 0.5498782396316528, "sampling/sampling_logp_difference/max": 0.5980584621429443, "sampling/sampling_logp_difference/mean": 0.008193101733922958, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4786.0, "completions/max_terminated_length": 4786.0, "completions/mean_length": 2496.0, "completions/mean_terminated_length": 2496.0, "completions/min_length": 890.0, "completions/min_terminated_length": 890.0, "entropy": 0.43051333725452423, "epoch": 0.21473551637279598, "frac_reward_zero_std": 0.0, "grad_norm": 0.10084098845033812, "kl": 0.0048521513235755265, "learning_rate": 8.910918888328983e-07, "loss": -0.0493, "num_tokens": 26884208.0, "reward": 2.2083334922790527, "reward_std": 0.7109179496765137, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4477763175964355, "sampling/importance_sampling_ratio/mean": 1.0000009536743164, "sampling/importance_sampling_ratio/min": 0.6260812878608704, "sampling/sampling_logp_difference/max": 0.4682750701904297, "sampling/sampling_logp_difference/mean": 0.01071916799992323, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5229.0, "completions/max_terminated_length": 5229.0, "completions/mean_length": 2554.041748046875, "completions/mean_terminated_length": 2554.041748046875, "completions/min_length": 541.0, "completions/min_terminated_length": 541.0, "entropy": 0.49234356731176376, "epoch": 0.21536523929471033, "frac_reward_zero_std": 0.0, "grad_norm": 0.11553883269563843, "kl": 0.003340679337270558, "learning_rate": 8.904748259434374e-07, "loss": -0.0055, "num_tokens": 26962689.0, "reward": 2.25, "reward_std": 0.6562265753746033, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7824058532714844, "sampling/importance_sampling_ratio/mean": 1.000136375427246, "sampling/importance_sampling_ratio/min": 0.6378293037414551, "sampling/sampling_logp_difference/max": 0.5779640674591064, "sampling/sampling_logp_difference/mean": 0.011965332552790642, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4075.0, "completions/max_terminated_length": 4075.0, "completions/mean_length": 2314.625, "completions/mean_terminated_length": 2314.625, "completions/min_length": 1001.0, "completions/min_terminated_length": 1001.0, "entropy": 0.30674873292446136, "epoch": 0.21599496221662468, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07671981383339731, "kl": 0.0031572734005749226, "learning_rate": 8.898562348136775e-07, "loss": -0.0863, "num_tokens": 27046472.0, "reward": 2.375, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7398329973220825, "sampling/importance_sampling_ratio/mean": 1.0000511407852173, "sampling/importance_sampling_ratio/min": 0.6440140604972839, "sampling/sampling_logp_difference/max": 0.5537891387939453, "sampling/sampling_logp_difference/mean": 0.007611554116010666, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4414.0, "completions/max_terminated_length": 4414.0, "completions/mean_length": 2471.70849609375, "completions/mean_terminated_length": 2471.70849609375, "completions/min_length": 678.0, "completions/min_terminated_length": 678.0, "entropy": 0.3573126047849655, "epoch": 0.21662468513853905, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.6878118110279704, "kl": 0.006431941408663988, "learning_rate": 8.892361178646611e-07, "loss": 0.013, "num_tokens": 27120873.0, "reward": 2.4583334922790527, "reward_std": 0.3053751587867737, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999983012676239, "sampling/importance_sampling_ratio/min": 0.0008686580113135278, "sampling/sampling_logp_difference/max": 7.048561096191406, "sampling/sampling_logp_difference/mean": 0.009029354900121689, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8089.0, "completions/max_terminated_length": 8089.0, "completions/mean_length": 3207.916748046875, "completions/mean_terminated_length": 3207.916748046875, "completions/min_length": 1261.0, "completions/min_terminated_length": 1261.0, "entropy": 0.31546300649642944, "epoch": 0.2172544080604534, "frac_reward_zero_std": 0.0, "grad_norm": 0.06852499024527194, "kl": 0.002601585874799639, "learning_rate": 8.886144775234015e-07, "loss": -0.2246, "num_tokens": 27226527.0, "reward": 2.2916667461395264, "reward_std": 0.5787960886955261, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4250026941299438, "sampling/importance_sampling_ratio/mean": 1.0000436305999756, "sampling/importance_sampling_ratio/min": 0.6680464148521423, "sampling/sampling_logp_difference/max": 0.4033975601196289, "sampling/sampling_logp_difference/mean": 0.007326172664761543, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5738.0, "completions/max_terminated_length": 5738.0, "completions/mean_length": 2619.041748046875, "completions/mean_terminated_length": 2619.041748046875, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "entropy": 0.2812144234776497, "epoch": 0.21788413098236775, "frac_reward_zero_std": 0.0, "grad_norm": 0.07689729431825748, "kl": 0.0032260846928693354, "learning_rate": 8.879913162228742e-07, "loss": -0.1262, "num_tokens": 27316584.0, "reward": 2.375, "reward_std": 0.5049939155578613, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.466253399848938, "sampling/importance_sampling_ratio/mean": 0.9999852180480957, "sampling/importance_sampling_ratio/min": 0.5835570693016052, "sampling/sampling_logp_difference/max": 0.5386130809783936, "sampling/sampling_logp_difference/mean": 0.0070577459409832954, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6273.0, "completions/max_terminated_length": 6273.0, "completions/mean_length": 3408.0, "completions/mean_terminated_length": 3408.0, "completions/min_length": 1700.0, "completions/min_terminated_length": 1700.0, "entropy": 0.5257942974567413, "epoch": 0.21851385390428213, "frac_reward_zero_std": 0.0, "grad_norm": 0.1151800986661764, "kl": 0.003427927556913346, "learning_rate": 8.873666364020084e-07, "loss": 0.0281, "num_tokens": 27413736.0, "reward": 2.4166667461395264, "reward_std": 0.4857778251171112, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.8144627809524536, "sampling/importance_sampling_ratio/mean": 1.0000367164611816, "sampling/importance_sampling_ratio/min": 0.14064651727676392, "sampling/sampling_logp_difference/max": 1.9615055322647095, "sampling/sampling_logp_difference/mean": 0.012324482202529907, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3987.0, "completions/max_terminated_length": 3987.0, "completions/mean_length": 2316.291748046875, "completions/mean_terminated_length": 2316.291748046875, "completions/min_length": 1156.0, "completions/min_terminated_length": 1156.0, "entropy": 0.29069532454013824, "epoch": 0.21914357682619648, "frac_reward_zero_std": 0.0, "grad_norm": 0.09864850963387488, "kl": 0.0028678098460659385, "learning_rate": 8.867404405056755e-07, "loss": 0.0014, "num_tokens": 27492615.0, "reward": 2.4166667461395264, "reward_std": 0.6257078647613525, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.721524953842163, "sampling/importance_sampling_ratio/mean": 1.0000873804092407, "sampling/importance_sampling_ratio/min": 0.6692290902137756, "sampling/sampling_logp_difference/max": 0.543210506439209, "sampling/sampling_logp_difference/mean": 0.007591416593641043, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4554.0, "completions/max_terminated_length": 4554.0, "completions/mean_length": 2261.0, "completions/mean_terminated_length": 2261.0, "completions/min_length": 638.0, "completions/min_terminated_length": 638.0, "entropy": 0.27323298528790474, "epoch": 0.21977329974811083, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09307308310082026, "kl": 0.003048075595870614, "learning_rate": 8.861127309846809e-07, "loss": -0.1047, "num_tokens": 27572999.0, "reward": 2.5, "reward_std": 0.45069071650505066, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6343977451324463, "sampling/importance_sampling_ratio/mean": 1.0001115798950195, "sampling/importance_sampling_ratio/min": 0.12701009213924408, "sampling/sampling_logp_difference/max": 2.063488721847534, "sampling/sampling_logp_difference/mean": 0.007326766848564148, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3988.0, "completions/max_terminated_length": 3988.0, "completions/mean_length": 1995.3333740234375, "completions/mean_terminated_length": 1995.3333740234375, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "entropy": 0.40310245007276535, "epoch": 0.22040302267002518, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0936953020139499, "kl": 0.0036091682268306613, "learning_rate": 8.854835102957541e-07, "loss": -0.0181, "num_tokens": 27631055.0, "reward": 2.5416667461395264, "reward_std": 0.367926687002182, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.575102686882019, "sampling/importance_sampling_ratio/mean": 0.9999661445617676, "sampling/importance_sampling_ratio/min": 0.47002965211868286, "sampling/sampling_logp_difference/max": 0.7549595832824707, "sampling/sampling_logp_difference/mean": 0.009571608155965805, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3229.0, "completions/max_terminated_length": 3229.0, "completions/mean_length": 1838.625, "completions/mean_terminated_length": 1838.625, "completions/min_length": 800.0, "completions/min_terminated_length": 800.0, "entropy": 0.3027261793613434, "epoch": 0.22103274559193956, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11366069641744897, "kl": 0.0035535451606847346, "learning_rate": 8.848527809015387e-07, "loss": -0.0616, "num_tokens": 27687854.0, "reward": 2.6666667461395264, "reward_std": 0.42052432894706726, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5520285367965698, "sampling/importance_sampling_ratio/mean": 0.9998032450675964, "sampling/importance_sampling_ratio/min": 0.6219798922538757, "sampling/sampling_logp_difference/max": 0.4748474955558777, "sampling/sampling_logp_difference/mean": 0.007637335918843746, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5608.0, "completions/max_terminated_length": 5608.0, "completions/mean_length": 2393.041748046875, "completions/mean_terminated_length": 2393.041748046875, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "entropy": 0.34766021370887756, "epoch": 0.2216624685138539, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07551632302386051, "kl": 0.003283481695689261, "learning_rate": 8.842205452705835e-07, "loss": -0.02, "num_tokens": 27755519.0, "reward": 2.4583334922790527, "reward_std": 0.17251639068126678, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6520028114318848, "sampling/importance_sampling_ratio/mean": 1.0000184774398804, "sampling/importance_sampling_ratio/min": 0.6991065144538879, "sampling/sampling_logp_difference/max": 0.501988410949707, "sampling/sampling_logp_difference/mean": 0.008451755158603191, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5487.0, "completions/max_terminated_length": 5487.0, "completions/mean_length": 2937.45849609375, "completions/mean_terminated_length": 2937.45849609375, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.383574403822422, "epoch": 0.22229219143576825, "frac_reward_zero_std": 0.0, "grad_norm": 0.09060443036661532, "kl": 0.0035539448144845665, "learning_rate": 8.835868058773322e-07, "loss": -0.0667, "num_tokens": 27849786.0, "reward": 2.125, "reward_std": 0.503990888595581, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.572704792022705, "sampling/importance_sampling_ratio/mean": 0.999988853931427, "sampling/importance_sampling_ratio/min": 0.37463611364364624, "sampling/sampling_logp_difference/max": 0.9818000793457031, "sampling/sampling_logp_difference/mean": 0.00942138209939003, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6802.0, "completions/max_terminated_length": 6802.0, "completions/mean_length": 2653.75, "completions/mean_terminated_length": 2653.75, "completions/min_length": 760.0, "completions/min_terminated_length": 760.0, "entropy": 0.42428842931985855, "epoch": 0.22292191435768263, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.10635341117791619, "kl": 0.0038975548814050853, "learning_rate": 8.829515652021138e-07, "loss": -0.0708, "num_tokens": 27922524.0, "reward": 2.5416667461395264, "reward_std": 0.24800793826580048, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.790175199508667, "sampling/importance_sampling_ratio/mean": 1.0002413988113403, "sampling/importance_sampling_ratio/min": 0.5300139784812927, "sampling/sampling_logp_difference/max": 0.6348519325256348, "sampling/sampling_logp_difference/mean": 0.010368994437158108, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6258.0, "completions/max_terminated_length": 6258.0, "completions/mean_length": 2927.791748046875, "completions/mean_terminated_length": 2927.791748046875, "completions/min_length": 1016.0, "completions/min_terminated_length": 1016.0, "entropy": 0.45628170669078827, "epoch": 0.22355163727959698, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1036882337897984, "kl": 0.003594797453843057, "learning_rate": 8.823148257311332e-07, "loss": -0.0763, "num_tokens": 28006399.0, "reward": 2.125, "reward_std": 0.3535533845424652, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6960599422454834, "sampling/importance_sampling_ratio/mean": 0.9999270439147949, "sampling/importance_sampling_ratio/min": 0.3841380774974823, "sampling/sampling_logp_difference/max": 0.9567532539367676, "sampling/sampling_logp_difference/mean": 0.01111992634832859, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6847.0, "completions/max_terminated_length": 6847.0, "completions/mean_length": 2806.416748046875, "completions/mean_terminated_length": 2806.416748046875, "completions/min_length": 1038.0, "completions/min_terminated_length": 1038.0, "entropy": 0.3687780871987343, "epoch": 0.22418136020151133, "frac_reward_zero_std": 0.0, "grad_norm": 0.08837819841753326, "kl": 0.0034112147986888885, "learning_rate": 8.816765899564612e-07, "loss": 0.022, "num_tokens": 28085289.0, "reward": 2.125, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3406540155410767, "sampling/importance_sampling_ratio/mean": 0.9999015927314758, "sampling/importance_sampling_ratio/min": 0.4277111887931824, "sampling/sampling_logp_difference/max": 0.8493070602416992, "sampling/sampling_logp_difference/mean": 0.009527037851512432, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6188.0, "completions/max_terminated_length": 6188.0, "completions/mean_length": 2978.625, "completions/mean_terminated_length": 2978.625, "completions/min_length": 991.0, "completions/min_terminated_length": 991.0, "entropy": 0.46392541378736496, "epoch": 0.22481108312342568, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10106595013039887, "kl": 0.003435912076383829, "learning_rate": 8.810368603760249e-07, "loss": 0.0023, "num_tokens": 28168512.0, "reward": 2.5833334922790527, "reward_std": 0.41387641429901123, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7125033140182495, "sampling/importance_sampling_ratio/mean": 0.9997785687446594, "sampling/importance_sampling_ratio/min": 0.5490427613258362, "sampling/sampling_logp_difference/max": 0.599578857421875, "sampling/sampling_logp_difference/mean": 0.011031635105609894, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3837.0, "completions/max_terminated_length": 3837.0, "completions/mean_length": 2357.625, "completions/mean_terminated_length": 2357.625, "completions/min_length": 1048.0, "completions/min_terminated_length": 1048.0, "entropy": 0.3660719767212868, "epoch": 0.22544080604534006, "frac_reward_zero_std": 0.0, "grad_norm": 0.10086654633529599, "kl": 0.00319903576746583, "learning_rate": 8.803956394935977e-07, "loss": -0.0151, "num_tokens": 28243159.0, "reward": 2.375, "reward_std": 0.48112308979034424, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998900890350342, "sampling/importance_sampling_ratio/min": 0.47644856572151184, "sampling/sampling_logp_difference/max": 0.7475066184997559, "sampling/sampling_logp_difference/mean": 0.008928487077355385, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5544.0, "completions/max_terminated_length": 5544.0, "completions/mean_length": 2400.625, "completions/mean_terminated_length": 2400.625, "completions/min_length": 814.0, "completions/min_terminated_length": 814.0, "entropy": 0.37393002212047577, "epoch": 0.2260705289672544, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10229983406282321, "kl": 0.003743716166354716, "learning_rate": 8.797529298187898e-07, "loss": 0.0048, "num_tokens": 28314446.0, "reward": 2.5, "reward_std": 0.5815500020980835, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5844759941101074, "sampling/importance_sampling_ratio/mean": 1.0000890493392944, "sampling/importance_sampling_ratio/min": 0.6248340010643005, "sampling/sampling_logp_difference/max": 0.47026920318603516, "sampling/sampling_logp_difference/mean": 0.009276754222810268, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6435.0, "completions/max_terminated_length": 6435.0, "completions/mean_length": 2581.08349609375, "completions/mean_terminated_length": 2581.08349609375, "completions/min_length": 1065.0, "completions/min_terminated_length": 1065.0, "entropy": 0.43828085064888, "epoch": 0.22670025188916876, "frac_reward_zero_std": 0.0, "grad_norm": 0.10514040039992371, "kl": 0.004168162762653083, "learning_rate": 8.791087338670382e-07, "loss": -0.0547, "num_tokens": 28387144.0, "reward": 2.25, "reward_std": 0.7962853312492371, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6488659381866455, "sampling/importance_sampling_ratio/mean": 0.9998676180839539, "sampling/importance_sampling_ratio/min": 0.7026621699333191, "sampling/sampling_logp_difference/max": 0.5000877380371094, "sampling/sampling_logp_difference/mean": 0.010207278653979301, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3998.0, "completions/max_terminated_length": 3998.0, "completions/mean_length": 1926.541748046875, "completions/mean_terminated_length": 1926.541748046875, "completions/min_length": 653.0, "completions/min_terminated_length": 653.0, "entropy": 0.28631681576371193, "epoch": 0.22732997481108314, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08347365183071076, "kl": 0.0037225084379315376, "learning_rate": 8.78463054159597e-07, "loss": -0.0102, "num_tokens": 28453805.0, "reward": 2.4583334922790527, "reward_std": 0.29602527618408203, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4121323823928833, "sampling/importance_sampling_ratio/mean": 0.9999871850013733, "sampling/importance_sampling_ratio/min": 0.5413921475410461, "sampling/sampling_logp_difference/max": 0.6136113405227661, "sampling/sampling_logp_difference/mean": 0.007168746553361416, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7949.0, "completions/mean_length": 3979.45849609375, "completions/mean_terminated_length": 3796.304443359375, "completions/min_length": 1391.0, "completions/min_terminated_length": 1391.0, "entropy": 0.4074585288763046, "epoch": 0.22795969773299748, "frac_reward_zero_std": 0.0, "grad_norm": 0.1456501767993873, "kl": 0.0027542277821339667, "learning_rate": 8.778158932235272e-07, "loss": 0.0054, "num_tokens": 28566880.0, "reward": 2.25, "reward_std": 0.8443689346313477, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5675089359283447, "sampling/importance_sampling_ratio/mean": 0.9999898076057434, "sampling/importance_sampling_ratio/min": 0.19109593331813812, "sampling/sampling_logp_difference/max": 1.6549797058105469, "sampling/sampling_logp_difference/mean": 0.009977209381759167, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7712.0, "completions/max_terminated_length": 7712.0, "completions/mean_length": 2985.875, "completions/mean_terminated_length": 2985.875, "completions/min_length": 735.0, "completions/min_terminated_length": 735.0, "entropy": 0.3779531270265579, "epoch": 0.22858942065491183, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0923751799890892, "kl": 0.0033961799927055836, "learning_rate": 8.771672535916871e-07, "loss": -0.0555, "num_tokens": 28648013.0, "reward": 2.7916667461395264, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5834177732467651, "sampling/importance_sampling_ratio/mean": 0.999920666217804, "sampling/importance_sampling_ratio/min": 0.5514373779296875, "sampling/sampling_logp_difference/max": 0.5952270030975342, "sampling/sampling_logp_difference/mean": 0.009446922689676285, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3726.0, "completions/max_terminated_length": 3726.0, "completions/mean_length": 2195.70849609375, "completions/mean_terminated_length": 2195.70849609375, "completions/min_length": 1288.0, "completions/min_terminated_length": 1288.0, "entropy": 0.2956593930721283, "epoch": 0.22921914357682618, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0921179446034368, "kl": 0.0035016892943531275, "learning_rate": 8.765171378027225e-07, "loss": -0.065, "num_tokens": 28712678.0, "reward": 2.7916667461395264, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5522269010543823, "sampling/importance_sampling_ratio/mean": 1.0000749826431274, "sampling/importance_sampling_ratio/min": 0.5628889203071594, "sampling/sampling_logp_difference/max": 0.5746729373931885, "sampling/sampling_logp_difference/mean": 0.007556205149739981, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5685.0, "completions/max_terminated_length": 5685.0, "completions/mean_length": 2845.83349609375, "completions/mean_terminated_length": 2845.83349609375, "completions/min_length": 574.0, "completions/min_terminated_length": 574.0, "entropy": 0.4551406055688858, "epoch": 0.22984886649874056, "frac_reward_zero_std": 0.0, "grad_norm": 0.10000270718996478, "kl": 0.0031773638329468668, "learning_rate": 8.758655484010565e-07, "loss": -0.1229, "num_tokens": 28791834.0, "reward": 2.2916667461395264, "reward_std": 0.5660459995269775, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6423593759536743, "sampling/importance_sampling_ratio/mean": 0.9999520182609558, "sampling/importance_sampling_ratio/min": 0.5388622879981995, "sampling/sampling_logp_difference/max": 0.6182951927185059, "sampling/sampling_logp_difference/mean": 0.01032539177685976, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5458.0, "completions/mean_length": 2991.08349609375, "completions/mean_terminated_length": 2764.95654296875, "completions/min_length": 1532.0, "completions/min_terminated_length": 1532.0, "entropy": 0.30518173798918724, "epoch": 0.2304785894206549, "frac_reward_zero_std": 0.0, "grad_norm": 0.08844851846294527, "kl": 0.0034815220860764384, "learning_rate": 8.752124879368792e-07, "loss": 0.0491, "num_tokens": 28892940.0, "reward": 2.25, "reward_std": 0.7626279592514038, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000139594078064, "sampling/importance_sampling_ratio/min": 0.689494252204895, "sampling/sampling_logp_difference/max": 0.7893617153167725, "sampling/sampling_logp_difference/mean": 0.008228064514696598, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6181.0, "completions/max_terminated_length": 6181.0, "completions/mean_length": 2382.166748046875, "completions/mean_terminated_length": 2382.166748046875, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "entropy": 0.4211115464568138, "epoch": 0.23110831234256926, "frac_reward_zero_std": 0.0, "grad_norm": 0.11464386068248701, "kl": 0.004011501849163324, "learning_rate": 8.74557958966139e-07, "loss": 0.0455, "num_tokens": 28959608.0, "reward": 2.75, "reward_std": 0.41387641429901123, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5769569873809814, "sampling/importance_sampling_ratio/mean": 0.9998785853385925, "sampling/importance_sampling_ratio/min": 0.6715154051780701, "sampling/sampling_logp_difference/max": 0.45549702644348145, "sampling/sampling_logp_difference/mean": 0.010428758338093758, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7961.0, "completions/max_terminated_length": 7961.0, "completions/mean_length": 4248.08349609375, "completions/mean_terminated_length": 4248.08349609375, "completions/min_length": 1450.0, "completions/min_terminated_length": 1450.0, "entropy": 0.3942534923553467, "epoch": 0.23173803526448364, "frac_reward_zero_std": 0.0, "grad_norm": 0.07891009859171759, "kl": 0.0027012189966626465, "learning_rate": 8.73901964050531e-07, "loss": -0.0618, "num_tokens": 29081314.0, "reward": 1.7916667461395264, "reward_std": 0.5317275524139404, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.988948941230774, "sampling/importance_sampling_ratio/mean": 0.9999208450317383, "sampling/importance_sampling_ratio/min": 0.2316516935825348, "sampling/sampling_logp_difference/max": 1.4625203609466553, "sampling/sampling_logp_difference/mean": 0.009695146232843399, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8159.0, "completions/max_terminated_length": 8159.0, "completions/mean_length": 2935.291748046875, "completions/mean_terminated_length": 2935.291748046875, "completions/min_length": 1847.0, "completions/min_terminated_length": 1847.0, "entropy": 0.27999172732234, "epoch": 0.232367758186398, "frac_reward_zero_std": 0.0, "grad_norm": 0.08087143941258933, "kl": 0.003412327030673623, "learning_rate": 8.732445057574879e-07, "loss": 0.0029, "num_tokens": 29179857.0, "reward": 2.2083334922790527, "reward_std": 0.6026668548583984, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7473949193954468, "sampling/importance_sampling_ratio/mean": 1.0000661611557007, "sampling/importance_sampling_ratio/min": 0.332884281873703, "sampling/sampling_logp_difference/max": 1.0999603271484375, "sampling/sampling_logp_difference/mean": 0.00761954253539443, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5837.0, "completions/max_terminated_length": 5837.0, "completions/mean_length": 3172.541748046875, "completions/mean_terminated_length": 3172.541748046875, "completions/min_length": 1624.0, "completions/min_terminated_length": 1624.0, "entropy": 0.46752408891916275, "epoch": 0.23299748110831234, "frac_reward_zero_std": 0.0, "grad_norm": 0.10445580674193455, "kl": 0.0030520972795784473, "learning_rate": 8.725855866601702e-07, "loss": -0.0001, "num_tokens": 29272222.0, "reward": 2.5416667461395264, "reward_std": 0.4082186818122864, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4773002862930298, "sampling/importance_sampling_ratio/mean": 1.0000905990600586, "sampling/importance_sampling_ratio/min": 0.7263866662979126, "sampling/sampling_logp_difference/max": 0.3902163505554199, "sampling/sampling_logp_difference/mean": 0.0108785480260849, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7625.0, "completions/max_terminated_length": 7625.0, "completions/mean_length": 3477.0, "completions/mean_terminated_length": 3477.0, "completions/min_length": 969.0, "completions/min_terminated_length": 969.0, "entropy": 0.3234460800886154, "epoch": 0.2336272040302267, "frac_reward_zero_std": 0.0, "grad_norm": 0.08385907249013089, "kl": 0.003239122743252665, "learning_rate": 8.719252093374551e-07, "loss": 0.0597, "num_tokens": 29382182.0, "reward": 2.125, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9330682754516602, "sampling/importance_sampling_ratio/mean": 0.9999256730079651, "sampling/importance_sampling_ratio/min": 0.5694210529327393, "sampling/sampling_logp_difference/max": 0.6591085195541382, "sampling/sampling_logp_difference/mean": 0.008415883406996727, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4294.0, "completions/max_terminated_length": 4294.0, "completions/mean_length": 1929.041748046875, "completions/mean_terminated_length": 1929.041748046875, "completions/min_length": 914.0, "completions/min_terminated_length": 914.0, "entropy": 0.3968869745731354, "epoch": 0.23425692695214106, "frac_reward_zero_std": 0.0, "grad_norm": 0.10812076093291063, "kl": 0.0038793698768131435, "learning_rate": 8.712633763739271e-07, "loss": 0.0191, "num_tokens": 29436919.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4396876096725464, "sampling/importance_sampling_ratio/mean": 1.0000511407852173, "sampling/importance_sampling_ratio/min": 0.48656415939331055, "sampling/sampling_logp_difference/max": 0.7203865051269531, "sampling/sampling_logp_difference/mean": 0.009327278472483158, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7723.0, "completions/mean_length": 4082.33349609375, "completions/mean_terminated_length": 3903.65234375, "completions/min_length": 1877.0, "completions/min_terminated_length": 1877.0, "entropy": 0.381720133125782, "epoch": 0.23488664987405541, "frac_reward_zero_std": 0.0, "grad_norm": 0.08882671197668973, "kl": 0.0031239804811775684, "learning_rate": 8.706000903598681e-07, "loss": 0.0075, "num_tokens": 29564615.0, "reward": 2.0833334922790527, "reward_std": 0.6582559943199158, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6996091604232788, "sampling/importance_sampling_ratio/mean": 1.0000609159469604, "sampling/importance_sampling_ratio/min": 0.49608954787254333, "sampling/sampling_logp_difference/max": 0.7009987831115723, "sampling/sampling_logp_difference/mean": 0.009965268895030022, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6465.0, "completions/max_terminated_length": 6465.0, "completions/mean_length": 3091.33349609375, "completions/mean_terminated_length": 3091.33349609375, "completions/min_length": 1418.0, "completions/min_terminated_length": 1418.0, "entropy": 0.5462045073509216, "epoch": 0.23551637279596976, "frac_reward_zero_std": 0.0, "grad_norm": 0.10014256904455533, "kl": 0.004098358505871147, "learning_rate": 8.699353538912467e-07, "loss": -0.0122, "num_tokens": 29657175.0, "reward": 2.25, "reward_std": 0.8928797245025635, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6645317077636719, "sampling/importance_sampling_ratio/mean": 0.999804675579071, "sampling/importance_sampling_ratio/min": 0.34111154079437256, "sampling/sampling_logp_difference/max": 1.0755457878112793, "sampling/sampling_logp_difference/mean": 0.012496387585997581, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5535.0, "completions/max_terminated_length": 5535.0, "completions/mean_length": 2510.125, "completions/mean_terminated_length": 2510.125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "entropy": 0.42511890083551407, "epoch": 0.23614609571788414, "frac_reward_zero_std": 0.0, "grad_norm": 0.12681463129738135, "kl": 0.0036894825752824545, "learning_rate": 8.692691695697085e-07, "loss": -0.0706, "num_tokens": 29730930.0, "reward": 2.2916667461395264, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6994876861572266, "sampling/importance_sampling_ratio/mean": 0.999862015247345, "sampling/importance_sampling_ratio/min": 0.32228708267211914, "sampling/sampling_logp_difference/max": 1.132312536239624, "sampling/sampling_logp_difference/mean": 0.010822772048413754, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4255.0, "completions/max_terminated_length": 4255.0, "completions/mean_length": 1708.0833740234375, "completions/mean_terminated_length": 1708.0833740234375, "completions/min_length": 708.0, "completions/min_terminated_length": 708.0, "entropy": 0.41312387585639954, "epoch": 0.2367758186397985, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07735334788450503, "kl": 0.0039310145657509565, "learning_rate": 8.686015400025654e-07, "loss": 0.0102, "num_tokens": 29785852.0, "reward": 2.5416667461395264, "reward_std": 0.24800793826580048, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.53260338306427, "sampling/importance_sampling_ratio/mean": 1.000014305114746, "sampling/importance_sampling_ratio/min": 0.6108437180519104, "sampling/sampling_logp_difference/max": 0.49291419982910156, "sampling/sampling_logp_difference/mean": 0.009703470394015312, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4986.0, "completions/max_terminated_length": 4986.0, "completions/mean_length": 2705.125, "completions/mean_terminated_length": 2705.125, "completions/min_length": 1344.0, "completions/min_terminated_length": 1344.0, "entropy": 0.39158955961465836, "epoch": 0.23740554156171284, "frac_reward_zero_std": 0.0, "grad_norm": 0.09323654570410486, "kl": 0.003581938217394054, "learning_rate": 8.67932467802786e-07, "loss": 0.0079, "num_tokens": 29875823.0, "reward": 2.2083334922790527, "reward_std": 0.6110328435897827, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.659709095954895, "sampling/importance_sampling_ratio/mean": 0.9998204112052917, "sampling/importance_sampling_ratio/min": 0.5115570425987244, "sampling/sampling_logp_difference/max": 0.6702961921691895, "sampling/sampling_logp_difference/mean": 0.009898645803332329, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5202.0, "completions/max_terminated_length": 5202.0, "completions/mean_length": 2849.95849609375, "completions/mean_terminated_length": 2849.95849609375, "completions/min_length": 71.0, "completions/min_terminated_length": 71.0, "entropy": 0.37934955209493637, "epoch": 0.2380352644836272, "frac_reward_zero_std": 0.0, "grad_norm": 0.11484263615644176, "kl": 0.0042914621299132705, "learning_rate": 8.672619555889848e-07, "loss": -0.0114, "num_tokens": 29961486.0, "reward": 2.1666667461395264, "reward_std": 0.8140539526939392, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.635912537574768, "sampling/importance_sampling_ratio/mean": 0.9999933838844299, "sampling/importance_sampling_ratio/min": 0.4124099016189575, "sampling/sampling_logp_difference/max": 0.8857375383377075, "sampling/sampling_logp_difference/mean": 0.00921761803328991, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6161.0, "completions/mean_length": 3612.291748046875, "completions/mean_terminated_length": 3413.174072265625, "completions/min_length": 947.0, "completions/min_terminated_length": 947.0, "entropy": 0.3361082449555397, "epoch": 0.23866498740554157, "frac_reward_zero_std": 0.0, "grad_norm": 0.07085770146134723, "kl": 0.003016880655195564, "learning_rate": 8.665900059854125e-07, "loss": 0.0778, "num_tokens": 30076877.0, "reward": 2.25, "reward_std": 0.4857778251171112, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999592900276184, "sampling/importance_sampling_ratio/min": 0.2789054811000824, "sampling/sampling_logp_difference/max": 1.276882290840149, "sampling/sampling_logp_difference/mean": 0.008139772340655327, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5994.0, "completions/max_terminated_length": 5994.0, "completions/mean_length": 2215.25, "completions/mean_terminated_length": 2215.25, "completions/min_length": 568.0, "completions/min_terminated_length": 568.0, "entropy": 0.333541564643383, "epoch": 0.23929471032745592, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10791431688436154, "kl": 0.004127588821575046, "learning_rate": 8.659166216219453e-07, "loss": 0.0206, "num_tokens": 30138739.0, "reward": 2.7083334922790527, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6765587329864502, "sampling/importance_sampling_ratio/mean": 1.000101923942566, "sampling/importance_sampling_ratio/min": 0.6390432715415955, "sampling/sampling_logp_difference/max": 0.5167433023452759, "sampling/sampling_logp_difference/mean": 0.00821358896791935, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7249.0, "completions/mean_length": 3035.0, "completions/mean_terminated_length": 2810.78271484375, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "entropy": 0.3649815768003464, "epoch": 0.23992443324937027, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09178958696755617, "kl": 0.0033229030668735504, "learning_rate": 8.652418051340746e-07, "loss": 0.0633, "num_tokens": 30227875.0, "reward": 2.75, "reward_std": 0.4232262969017029, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.507324457168579, "sampling/importance_sampling_ratio/mean": 0.9999259114265442, "sampling/importance_sampling_ratio/min": 0.6683423519134521, "sampling/sampling_logp_difference/max": 0.4103362560272217, "sampling/sampling_logp_difference/mean": 0.009349497966468334, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4920.0, "completions/max_terminated_length": 4920.0, "completions/mean_length": 2721.416748046875, "completions/mean_terminated_length": 2721.416748046875, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "entropy": 0.43788283318281174, "epoch": 0.24055415617128464, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06043498692419099, "kl": 0.003831459558568895, "learning_rate": 8.645655591628975e-07, "loss": -0.0175, "num_tokens": 30309653.0, "reward": 2.0416667461395264, "reward_std": 0.27817434072494507, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5644444227218628, "sampling/importance_sampling_ratio/mean": 1.0001050233840942, "sampling/importance_sampling_ratio/min": 0.4716321527957916, "sampling/sampling_logp_difference/max": 0.7515559196472168, "sampling/sampling_logp_difference/mean": 0.010326726362109184, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4454.0, "completions/max_terminated_length": 4454.0, "completions/mean_length": 2273.541748046875, "completions/mean_terminated_length": 2273.541748046875, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "entropy": 0.29575880616903305, "epoch": 0.241183879093199, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10939065614664668, "kl": 0.0033800284727476537, "learning_rate": 8.638878863551049e-07, "loss": -0.0221, "num_tokens": 30383818.0, "reward": 2.5833334922790527, "reward_std": 0.40627965331077576, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4953738451004028, "sampling/importance_sampling_ratio/mean": 0.9999351501464844, "sampling/importance_sampling_ratio/min": 0.5919371247291565, "sampling/sampling_logp_difference/max": 0.5243549346923828, "sampling/sampling_logp_difference/mean": 0.008034588769078255, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3554.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 1641.8333740234375, "completions/mean_terminated_length": 1641.8333740234375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "entropy": 0.2606827989220619, "epoch": 0.24181360201511334, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09728668169152571, "kl": 0.003917156194802374, "learning_rate": 8.63208789362973e-07, "loss": -0.1285, "num_tokens": 30437110.0, "reward": 2.7916667461395264, "reward_std": 0.29602527618408203, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6880457401275635, "sampling/importance_sampling_ratio/mean": 1.0001168251037598, "sampling/importance_sampling_ratio/min": 0.607323944568634, "sampling/sampling_logp_difference/max": 0.5235714912414551, "sampling/sampling_logp_difference/mean": 0.00690791942179203, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6540.0, "completions/max_terminated_length": 6540.0, "completions/mean_length": 2981.75, "completions/mean_terminated_length": 2981.75, "completions/min_length": 1473.0, "completions/min_terminated_length": 1473.0, "entropy": 0.273178368806839, "epoch": 0.24244332493702772, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0700273349040693, "kl": 0.002915205725003034, "learning_rate": 8.625282708443508e-07, "loss": -0.0537, "num_tokens": 30541704.0, "reward": 2.5, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7749037742614746, "sampling/importance_sampling_ratio/mean": 0.9998607635498047, "sampling/importance_sampling_ratio/min": 0.3858320116996765, "sampling/sampling_logp_difference/max": 0.9523532390594482, "sampling/sampling_logp_difference/mean": 0.007322139106690884, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5734.0, "completions/max_terminated_length": 5734.0, "completions/mean_length": 2454.0, "completions/mean_terminated_length": 2454.0, "completions/min_length": 696.0, "completions/min_terminated_length": 696.0, "entropy": 0.46296437084674835, "epoch": 0.24307304785894207, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10720944788628714, "kl": 0.0045704105868935585, "learning_rate": 8.61846333462652e-07, "loss": 0.0012, "num_tokens": 30614416.0, "reward": 2.375, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3953828811645508, "sampling/importance_sampling_ratio/mean": 0.9999436736106873, "sampling/importance_sampling_ratio/min": 0.48676547408103943, "sampling/sampling_logp_difference/max": 0.7199728488922119, "sampling/sampling_logp_difference/mean": 0.010931137017905712, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4502.0, "completions/max_terminated_length": 4502.0, "completions/mean_length": 1963.3333740234375, "completions/mean_terminated_length": 1963.3333740234375, "completions/min_length": 1117.0, "completions/min_terminated_length": 1117.0, "entropy": 0.3629356399178505, "epoch": 0.24370277078085642, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08760940600631191, "kl": 0.0038105016574263573, "learning_rate": 8.611629798868429e-07, "loss": -0.0084, "num_tokens": 30675176.0, "reward": 2.625, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3632644414901733, "sampling/importance_sampling_ratio/mean": 1.0000252723693848, "sampling/importance_sampling_ratio/min": 0.7032243013381958, "sampling/sampling_logp_difference/max": 0.3520793914794922, "sampling/sampling_logp_difference/mean": 0.0086840083822608, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3550.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 1865.375, "completions/mean_terminated_length": 1865.375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.29491564631462097, "epoch": 0.24433249370277077, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10036582550691356, "kl": 0.0035650263307616115, "learning_rate": 8.604782127914327e-07, "loss": -0.0155, "num_tokens": 30733265.0, "reward": 2.7916667461395264, "reward_std": 0.4023112952709198, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.494578242301941, "sampling/importance_sampling_ratio/mean": 1.0000454187393188, "sampling/importance_sampling_ratio/min": 0.3620733618736267, "sampling/sampling_logp_difference/max": 1.0159084796905518, "sampling/sampling_logp_difference/mean": 0.007849405519664288, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7068.0, "completions/max_terminated_length": 7068.0, "completions/mean_length": 3081.20849609375, "completions/mean_terminated_length": 3081.20849609375, "completions/min_length": 846.0, "completions/min_terminated_length": 846.0, "entropy": 0.4000629037618637, "epoch": 0.24496221662468515, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0853734554912322, "kl": 0.00308474205667153, "learning_rate": 8.597920348564625e-07, "loss": -0.0184, "num_tokens": 30836038.0, "reward": 2.625, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999335408210754, "sampling/importance_sampling_ratio/min": 0.5320566892623901, "sampling/sampling_logp_difference/max": 1.1756372451782227, "sampling/sampling_logp_difference/mean": 0.009822065010666847, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5110.0, "completions/max_terminated_length": 5110.0, "completions/mean_length": 2622.375, "completions/mean_terminated_length": 2622.375, "completions/min_length": 1299.0, "completions/min_terminated_length": 1299.0, "entropy": 0.42516617476940155, "epoch": 0.2455919395465995, "frac_reward_zero_std": 0.0, "grad_norm": 0.09272684776791312, "kl": 0.0037228105356916785, "learning_rate": 8.591044487674954e-07, "loss": -0.0579, "num_tokens": 30911511.0, "reward": 2.6666667461395264, "reward_std": 0.5201624631881714, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.761385202407837, "sampling/importance_sampling_ratio/mean": 0.999955952167511, "sampling/importance_sampling_ratio/min": 0.3080613911151886, "sampling/sampling_logp_difference/max": 1.1774561405181885, "sampling/sampling_logp_difference/mean": 0.010109052993357182, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6999.0, "completions/max_terminated_length": 6999.0, "completions/mean_length": 2585.166748046875, "completions/mean_terminated_length": 2585.166748046875, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "entropy": 0.3407868295907974, "epoch": 0.24622166246851385, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11805869386142308, "kl": 0.003632327716331929, "learning_rate": 8.58415457215606e-07, "loss": -0.0492, "num_tokens": 30994219.0, "reward": 2.5416667461395264, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3685972690582275, "sampling/importance_sampling_ratio/mean": 1.0002409219741821, "sampling/importance_sampling_ratio/min": 0.5519925951957703, "sampling/sampling_logp_difference/max": 0.5942206382751465, "sampling/sampling_logp_difference/mean": 0.008531739935278893, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2610.0, "completions/max_terminated_length": 2610.0, "completions/mean_length": 1714.7083740234375, "completions/mean_terminated_length": 1714.7083740234375, "completions/min_length": 1232.0, "completions/min_terminated_length": 1232.0, "entropy": 0.2526017539203167, "epoch": 0.24685138539042822, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06854210126910484, "kl": 0.0037952258717268705, "learning_rate": 8.577250628973689e-07, "loss": -0.0312, "num_tokens": 31047092.0, "reward": 2.9166667461395264, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3217989206314087, "sampling/importance_sampling_ratio/mean": 0.9998249411582947, "sampling/importance_sampling_ratio/min": 0.7720771431922913, "sampling/sampling_logp_difference/max": 0.2789936065673828, "sampling/sampling_logp_difference/mean": 0.006359691731631756, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7694.0, "completions/max_terminated_length": 7694.0, "completions/mean_length": 2435.0, "completions/mean_terminated_length": 2435.0, "completions/min_length": 844.0, "completions/min_terminated_length": 844.0, "entropy": 0.2624824084341526, "epoch": 0.24748110831234257, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07158140640968234, "kl": 0.003618511720560491, "learning_rate": 8.570332685148496e-07, "loss": -0.0271, "num_tokens": 31134924.0, "reward": 2.2916667461395264, "reward_std": 0.4023112952709198, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000152587890625, "sampling/importance_sampling_ratio/min": 0.4186590909957886, "sampling/sampling_logp_difference/max": 1.3058152198791504, "sampling/sampling_logp_difference/mean": 0.0070632509887218475, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7019.0, "completions/max_terminated_length": 7019.0, "completions/mean_length": 3085.916748046875, "completions/mean_terminated_length": 3085.916748046875, "completions/min_length": 1382.0, "completions/min_terminated_length": 1382.0, "entropy": 0.2921256124973297, "epoch": 0.24811083123425692, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07200192960339794, "kl": 0.003469546267297119, "learning_rate": 8.563400767755926e-07, "loss": 0.0172, "num_tokens": 31232866.0, "reward": 2.2916667461395264, "reward_std": 0.4082186222076416, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8608567714691162, "sampling/importance_sampling_ratio/mean": 1.0001320838928223, "sampling/importance_sampling_ratio/min": 0.4452818036079407, "sampling/sampling_logp_difference/max": 0.8090479373931885, "sampling/sampling_logp_difference/mean": 0.007963865995407104, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4673.0, "completions/max_terminated_length": 4673.0, "completions/mean_length": 1862.916748046875, "completions/mean_terminated_length": 1862.916748046875, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "entropy": 0.2826088070869446, "epoch": 0.24874055415617127, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1160837341984308, "kl": 0.004048199916724116, "learning_rate": 8.556454903926118e-07, "loss": 0.1122, "num_tokens": 31294024.0, "reward": 2.875, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.439069151878357, "sampling/importance_sampling_ratio/mean": 1.0000957250595093, "sampling/importance_sampling_ratio/min": 0.5358250141143799, "sampling/sampling_logp_difference/max": 0.6239476203918457, "sampling/sampling_logp_difference/mean": 0.007556803058832884, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6479.0, "completions/max_terminated_length": 6479.0, "completions/mean_length": 2843.70849609375, "completions/mean_terminated_length": 2843.70849609375, "completions/min_length": 997.0, "completions/min_terminated_length": 997.0, "entropy": 0.389358825981617, "epoch": 0.24937027707808565, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0944271803565641, "kl": 0.003543155500665307, "learning_rate": 8.549495120843789e-07, "loss": 0.0563, "num_tokens": 31378129.0, "reward": 2.5416667461395264, "reward_std": 0.367926687002182, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999856173992157, "sampling/importance_sampling_ratio/min": 0.4409363269805908, "sampling/sampling_logp_difference/max": 1.2594795227050781, "sampling/sampling_logp_difference/mean": 0.009759652428328991, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4754.0, "completions/max_terminated_length": 4754.0, "completions/mean_length": 2665.625, "completions/mean_terminated_length": 2665.625, "completions/min_length": 1023.0, "completions/min_terminated_length": 1023.0, "entropy": 0.2958349883556366, "epoch": 0.25, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.05433287398903826, "kl": 0.003001197299454361, "learning_rate": 8.54252144574814e-07, "loss": -0.016, "num_tokens": 31465608.0, "reward": 2.7083334922790527, "reward_std": 0.27817434072494507, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5279598236083984, "sampling/importance_sampling_ratio/mean": 0.9999701380729675, "sampling/importance_sampling_ratio/min": 0.3686315417289734, "sampling/sampling_logp_difference/max": 0.997957706451416, "sampling/sampling_logp_difference/mean": 0.007418825291097164, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7822.0, "completions/mean_length": 4534.125, "completions/mean_terminated_length": 4375.0869140625, "completions/min_length": 2177.0, "completions/min_terminated_length": 2177.0, "entropy": 0.6547581702470779, "epoch": 0.2506297229219144, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08550504345375647, "kl": 0.003689503821078688, "learning_rate": 8.535533905932737e-07, "loss": 0.0511, "num_tokens": 31584187.0, "reward": 2.0833334922790527, "reward_std": 0.5232069492340088, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000159740447998, "sampling/importance_sampling_ratio/min": 0.2236732393503189, "sampling/sampling_logp_difference/max": 1.4975690841674805, "sampling/sampling_logp_difference/mean": 0.014274278655648232, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5063.0, "completions/max_terminated_length": 5063.0, "completions/mean_length": 2919.625, "completions/mean_terminated_length": 2919.625, "completions/min_length": 1474.0, "completions/min_terminated_length": 1474.0, "entropy": 0.38863953202962875, "epoch": 0.2512594458438287, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.056093496312262826, "kl": 0.003454599413089454, "learning_rate": 8.528532528745414e-07, "loss": -0.0543, "num_tokens": 31665778.0, "reward": 2.25, "reward_std": 0.38613972067832947, "rewards/cloze_reward/mean": 0.3333333432674408, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8101941347122192, "sampling/importance_sampling_ratio/mean": 1.0000752210617065, "sampling/importance_sampling_ratio/min": 0.5494928956031799, "sampling/sampling_logp_difference/max": 0.5987594127655029, "sampling/sampling_logp_difference/mean": 0.009637645445764065, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5977.0, "completions/max_terminated_length": 5977.0, "completions/mean_length": 2418.5, "completions/mean_terminated_length": 2418.5, "completions/min_length": 1152.0, "completions/min_terminated_length": 1152.0, "entropy": 0.3424602225422859, "epoch": 0.2518891687657431, "frac_reward_zero_std": 0.0, "grad_norm": 0.09824517742189777, "kl": 0.0037095732986927032, "learning_rate": 8.521517341588159e-07, "loss": -0.0484, "num_tokens": 31741566.0, "reward": 2.625, "reward_std": 0.6138670444488525, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5986859798431396, "sampling/importance_sampling_ratio/mean": 1.0001450777053833, "sampling/importance_sampling_ratio/min": 0.6507557034492493, "sampling/sampling_logp_difference/max": 0.46918201446533203, "sampling/sampling_logp_difference/mean": 0.008574992418289185, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3540.0, "completions/max_terminated_length": 3540.0, "completions/mean_length": 1583.2083740234375, "completions/mean_terminated_length": 1583.2083740234375, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.3525238335132599, "epoch": 0.25251889168765745, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0956697372658836, "kl": 0.0045785545371472836, "learning_rate": 8.51448837191701e-07, "loss": 0.034, "num_tokens": 31790347.0, "reward": 2.5833334922790527, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5670865774154663, "sampling/importance_sampling_ratio/mean": 1.000175952911377, "sampling/importance_sampling_ratio/min": 0.48228761553764343, "sampling/sampling_logp_difference/max": 0.7292146682739258, "sampling/sampling_logp_difference/mean": 0.009130642749369144, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3586.0, "completions/max_terminated_length": 3586.0, "completions/mean_length": 1708.125, "completions/mean_terminated_length": 1708.125, "completions/min_length": 759.0, "completions/min_terminated_length": 759.0, "entropy": 0.27663227170705795, "epoch": 0.2531486146095718, "frac_reward_zero_std": 0.0, "grad_norm": 0.12830133266311017, "kl": 0.0040739934775047, "learning_rate": 8.507445647241946e-07, "loss": -0.0951, "num_tokens": 31857774.0, "reward": 2.4583334922790527, "reward_std": 0.5787960886955261, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9439961910247803, "sampling/importance_sampling_ratio/mean": 1.0000945329666138, "sampling/importance_sampling_ratio/min": 0.6456785202026367, "sampling/sampling_logp_difference/max": 0.6647458076477051, "sampling/sampling_logp_difference/mean": 0.007384301163256168, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3911.0, "completions/max_terminated_length": 3911.0, "completions/mean_length": 2037.625, "completions/mean_terminated_length": 2037.625, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "entropy": 0.29582324996590614, "epoch": 0.25377833753148615, "frac_reward_zero_std": 0.0, "grad_norm": 0.0922500762081576, "kl": 0.0031234349007718265, "learning_rate": 8.500389195126783e-07, "loss": -0.1155, "num_tokens": 31922061.0, "reward": 2.375, "reward_std": 0.5914937257766724, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.587881088256836, "sampling/importance_sampling_ratio/mean": 1.0000954866409302, "sampling/importance_sampling_ratio/min": 0.5137322545051575, "sampling/sampling_logp_difference/max": 0.666053056716919, "sampling/sampling_logp_difference/mean": 0.007658777292817831, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6107.0, "completions/max_terminated_length": 6107.0, "completions/mean_length": 2651.625, "completions/mean_terminated_length": 2651.625, "completions/min_length": 1387.0, "completions/min_terminated_length": 1387.0, "entropy": 0.33685019612312317, "epoch": 0.25440806045340053, "frac_reward_zero_std": 0.0, "grad_norm": 0.10512243656847317, "kl": 0.003961170499678701, "learning_rate": 8.493319043189061e-07, "loss": 0.0597, "num_tokens": 32001044.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4102495908737183, "sampling/importance_sampling_ratio/mean": 1.0001553297042847, "sampling/importance_sampling_ratio/min": 0.329790323972702, "sampling/sampling_logp_difference/max": 1.1092982292175293, "sampling/sampling_logp_difference/mean": 0.008749710395932198, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7025.0, "completions/max_terminated_length": 7025.0, "completions/mean_length": 2836.20849609375, "completions/mean_terminated_length": 2836.20849609375, "completions/min_length": 1052.0, "completions/min_terminated_length": 1052.0, "entropy": 0.5126289054751396, "epoch": 0.25503778337531485, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0982156148568169, "kl": 0.004225271753966808, "learning_rate": 8.48623521909994e-07, "loss": -0.063, "num_tokens": 32078705.0, "reward": 2.5833334922790527, "reward_std": 0.41387641429901123, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4537101984024048, "sampling/importance_sampling_ratio/mean": 0.9997928142547607, "sampling/importance_sampling_ratio/min": 0.6850728392601013, "sampling/sampling_logp_difference/max": 0.37823009490966797, "sampling/sampling_logp_difference/mean": 0.011898769065737724, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5997.0, "completions/max_terminated_length": 5997.0, "completions/mean_length": 3479.125, "completions/mean_terminated_length": 3479.125, "completions/min_length": 1255.0, "completions/min_terminated_length": 1255.0, "entropy": 0.3111003115773201, "epoch": 0.25566750629722923, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0763428033111962, "kl": 0.0032686442136764526, "learning_rate": 8.47913775058409e-07, "loss": 0.0159, "num_tokens": 32185396.0, "reward": 2.5833334922790527, "reward_std": 0.487678587436676, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001941919326782, "sampling/importance_sampling_ratio/min": 0.5696032643318176, "sampling/sampling_logp_difference/max": 1.08066725730896, "sampling/sampling_logp_difference/mean": 0.008520308881998062, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3962.0, "completions/max_terminated_length": 3962.0, "completions/mean_length": 1739.0, "completions/mean_terminated_length": 1739.0, "completions/min_length": 755.0, "completions/min_terminated_length": 755.0, "entropy": 0.37740372866392136, "epoch": 0.25629722921914355, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0881620112317221, "kl": 0.004177744383923709, "learning_rate": 8.472026665419581e-07, "loss": 0.0554, "num_tokens": 32238868.0, "reward": 2.5416667461395264, "reward_std": 0.17251639068126678, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.377084732055664, "sampling/importance_sampling_ratio/mean": 0.9999259114265442, "sampling/importance_sampling_ratio/min": 0.623293936252594, "sampling/sampling_logp_difference/max": 0.47273707389831543, "sampling/sampling_logp_difference/mean": 0.009481417015194893, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4432.0, "completions/max_terminated_length": 4432.0, "completions/mean_length": 2080.125, "completions/mean_terminated_length": 2080.125, "completions/min_length": 739.0, "completions/min_terminated_length": 739.0, "entropy": 0.3234003484249115, "epoch": 0.25692695214105793, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0551021431342774, "kl": 0.0038418974145315588, "learning_rate": 8.464901991437775e-07, "loss": -0.0903, "num_tokens": 32303751.0, "reward": 2.9166667461395264, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8110120296478271, "sampling/importance_sampling_ratio/mean": 0.9998793601989746, "sampling/importance_sampling_ratio/min": 0.6522031426429749, "sampling/sampling_logp_difference/max": 0.5938858985900879, "sampling/sampling_logp_difference/mean": 0.008040483109652996, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6353.0, "completions/max_terminated_length": 6353.0, "completions/mean_length": 2565.58349609375, "completions/mean_terminated_length": 2565.58349609375, "completions/min_length": 1114.0, "completions/min_terminated_length": 1114.0, "entropy": 0.35131794959306717, "epoch": 0.2575566750629723, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10002571156963852, "kl": 0.0036665222723968327, "learning_rate": 8.457763756523222e-07, "loss": -0.0446, "num_tokens": 32392485.0, "reward": 2.5416667461395264, "reward_std": 0.4023112952709198, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6225377321243286, "sampling/importance_sampling_ratio/mean": 0.9999440312385559, "sampling/importance_sampling_ratio/min": 0.27643996477127075, "sampling/sampling_logp_difference/max": 1.2857615947723389, "sampling/sampling_logp_difference/mean": 0.009244379587471485, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4958.0, "completions/max_terminated_length": 4958.0, "completions/mean_length": 2444.666748046875, "completions/mean_terminated_length": 2444.666748046875, "completions/min_length": 857.0, "completions/min_terminated_length": 857.0, "entropy": 0.2726333811879158, "epoch": 0.25818639798488663, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.05685104884850267, "kl": 0.003359017486218363, "learning_rate": 8.450611988613545e-07, "loss": -0.1155, "num_tokens": 32484245.0, "reward": 2.75, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000158548355103, "sampling/importance_sampling_ratio/min": 0.326270192861557, "sampling/sampling_logp_difference/max": 1.1200294494628906, "sampling/sampling_logp_difference/mean": 0.007186663802713156, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6436.0, "completions/mean_length": 3817.70849609375, "completions/mean_terminated_length": 3627.521728515625, "completions/min_length": 1667.0, "completions/min_terminated_length": 1667.0, "entropy": 0.5061573088169098, "epoch": 0.258816120906801, "frac_reward_zero_std": 0.0, "grad_norm": 0.09590602247461909, "kl": 0.0037478121230378747, "learning_rate": 8.443446715699331e-07, "loss": 0.0382, "num_tokens": 32590286.0, "reward": 2.25, "reward_std": 0.6381160616874695, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5747489929199219, "sampling/importance_sampling_ratio/mean": 0.9998939633369446, "sampling/importance_sampling_ratio/min": 0.4349980652332306, "sampling/sampling_logp_difference/max": 0.8324136734008789, "sampling/sampling_logp_difference/mean": 0.012181257829070091, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3255.0, "completions/max_terminated_length": 3255.0, "completions/mean_length": 1380.0, "completions/mean_terminated_length": 1380.0, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.3060273751616478, "epoch": 0.2594458438287154, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1212163699715867, "kl": 0.004300453525502235, "learning_rate": 8.436267965824022e-07, "loss": 0.0559, "num_tokens": 32632822.0, "reward": 2.9166667461395264, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2897530794143677, "sampling/importance_sampling_ratio/mean": 0.999941885471344, "sampling/importance_sampling_ratio/min": 0.6203213334083557, "sampling/sampling_logp_difference/max": 0.4775177240371704, "sampling/sampling_logp_difference/mean": 0.007267358712852001, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7973.0, "completions/max_terminated_length": 7973.0, "completions/mean_length": 4042.0, "completions/mean_terminated_length": 4042.0, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "entropy": 0.5082273408770561, "epoch": 0.2600755667506297, "frac_reward_zero_std": 0.0, "grad_norm": 0.10168808423780024, "kl": 0.003576478804461658, "learning_rate": 8.429075767083808e-07, "loss": 0.0493, "num_tokens": 32745374.0, "reward": 2.5, "reward_std": 0.5605830550193787, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6239815950393677, "sampling/importance_sampling_ratio/mean": 1.0000396966934204, "sampling/importance_sampling_ratio/min": 0.5867737531661987, "sampling/sampling_logp_difference/max": 0.5331159830093384, "sampling/sampling_logp_difference/mean": 0.012539437040686607, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4287.0, "completions/max_terminated_length": 4287.0, "completions/mean_length": 1694.8333740234375, "completions/mean_terminated_length": 1694.8333740234375, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "entropy": 0.2505948133766651, "epoch": 0.2607052896725441, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0648866224093141, "kl": 0.003542486170772463, "learning_rate": 8.42187014762752e-07, "loss": -0.0098, "num_tokens": 32796706.0, "reward": 2.9583334922790527, "reward_std": 0.1178511306643486, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6117935180664062, "sampling/importance_sampling_ratio/mean": 1.0000418424606323, "sampling/importance_sampling_ratio/min": 0.5059456825256348, "sampling/sampling_logp_difference/max": 0.6813259124755859, "sampling/sampling_logp_difference/mean": 0.006622202694416046, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5799.0, "completions/max_terminated_length": 5799.0, "completions/mean_length": 1896.8333740234375, "completions/mean_terminated_length": 1896.8333740234375, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.3618685305118561, "epoch": 0.26133501259445846, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11157625377506564, "kl": 0.0041494728066027164, "learning_rate": 8.414651135656505e-07, "loss": 0.0532, "num_tokens": 32851198.0, "reward": 2.7083334922790527, "reward_std": 0.41331955790519714, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4553370475769043, "sampling/importance_sampling_ratio/mean": 1.0001089572906494, "sampling/importance_sampling_ratio/min": 0.4779936969280243, "sampling/sampling_logp_difference/max": 0.7381577491760254, "sampling/sampling_logp_difference/mean": 0.008844821713864803, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3149.0, "completions/max_terminated_length": 3149.0, "completions/mean_length": 1557.25, "completions/mean_terminated_length": 1557.25, "completions/min_length": 951.0, "completions/min_terminated_length": 951.0, "entropy": 0.3024144694209099, "epoch": 0.2619647355163728, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.2169069201005949, "kl": 0.003917759342584759, "learning_rate": 8.407418759424531e-07, "loss": 0.0755, "num_tokens": 32897852.0, "reward": 2.75, "reward_std": 0.42052432894706726, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5749245882034302, "sampling/importance_sampling_ratio/mean": 1.0000396966934204, "sampling/importance_sampling_ratio/min": 0.7713643312454224, "sampling/sampling_logp_difference/max": 0.4542074203491211, "sampling/sampling_logp_difference/mean": 0.007604994811117649, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7680.0, "completions/max_terminated_length": 7680.0, "completions/mean_length": 2490.45849609375, "completions/mean_terminated_length": 2490.45849609375, "completions/min_length": 741.0, "completions/min_terminated_length": 741.0, "entropy": 0.38727016001939774, "epoch": 0.26259445843828716, "frac_reward_zero_std": 0.0, "grad_norm": 0.11957482250685188, "kl": 0.004248477867804468, "learning_rate": 8.400173047237673e-07, "loss": 0.1279, "num_tokens": 32970959.0, "reward": 2.1666667461395264, "reward_std": 0.5232069492340088, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.840181589126587, "sampling/importance_sampling_ratio/mean": 0.9999740719795227, "sampling/importance_sampling_ratio/min": 0.5357444286346436, "sampling/sampling_logp_difference/max": 0.6240980625152588, "sampling/sampling_logp_difference/mean": 0.01015404798090458, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5242.0, "completions/max_terminated_length": 5242.0, "completions/mean_length": 2248.916748046875, "completions/mean_terminated_length": 2248.916748046875, "completions/min_length": 1181.0, "completions/min_terminated_length": 1181.0, "entropy": 0.25381582230329514, "epoch": 0.26322418136020154, "frac_reward_zero_std": 0.0, "grad_norm": 0.11094334187382827, "kl": 0.0037484666099771857, "learning_rate": 8.392914027454197e-07, "loss": 0.078, "num_tokens": 33041077.0, "reward": 2.5833334922790527, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4747542142868042, "sampling/importance_sampling_ratio/mean": 0.9999878406524658, "sampling/importance_sampling_ratio/min": 0.27345556020736694, "sampling/sampling_logp_difference/max": 1.2966161966323853, "sampling/sampling_logp_difference/mean": 0.007269163616001606, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3507.0, "completions/max_terminated_length": 3507.0, "completions/mean_length": 1694.666748046875, "completions/mean_terminated_length": 1694.666748046875, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "entropy": 0.3322603404521942, "epoch": 0.26385390428211586, "frac_reward_zero_std": 0.0, "grad_norm": 0.11812706890018834, "kl": 0.004073342541232705, "learning_rate": 8.385641728484451e-07, "loss": -0.0237, "num_tokens": 33093957.0, "reward": 2.7916667461395264, "reward_std": 0.5078567266464233, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6876384019851685, "sampling/importance_sampling_ratio/mean": 0.9999082088470459, "sampling/importance_sampling_ratio/min": 0.5204079151153564, "sampling/sampling_logp_difference/max": 0.6531423926353455, "sampling/sampling_logp_difference/mean": 0.008422679267823696, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4324.0, "completions/max_terminated_length": 4324.0, "completions/mean_length": 1911.125, "completions/mean_terminated_length": 1911.125, "completions/min_length": 729.0, "completions/min_terminated_length": 729.0, "entropy": 0.3123777434229851, "epoch": 0.26448362720403024, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10763516910655323, "kl": 0.004467605671379715, "learning_rate": 8.378356178790762e-07, "loss": 0.0238, "num_tokens": 33152584.0, "reward": 2.7916667461395264, "reward_std": 0.3698274493217468, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.417487621307373, "sampling/importance_sampling_ratio/mean": 0.9998214840888977, "sampling/importance_sampling_ratio/min": 0.6638790965080261, "sampling/sampling_logp_difference/max": 0.40965521335601807, "sampling/sampling_logp_difference/mean": 0.00801766850054264, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6022.0, "completions/max_terminated_length": 6022.0, "completions/mean_length": 3128.791748046875, "completions/mean_terminated_length": 3128.791748046875, "completions/min_length": 1415.0, "completions/min_terminated_length": 1415.0, "entropy": 0.3734581246972084, "epoch": 0.26511335012594456, "frac_reward_zero_std": 0.0, "grad_norm": 0.08625873163352216, "kl": 0.003889659186825156, "learning_rate": 8.371057406887309e-07, "loss": 0.0228, "num_tokens": 33253891.0, "reward": 2.375, "reward_std": 0.5625219941139221, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999397397041321, "sampling/importance_sampling_ratio/min": 0.6140060424804688, "sampling/sampling_logp_difference/max": 0.7670841217041016, "sampling/sampling_logp_difference/mean": 0.009944064542651176, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3670.0, "completions/max_terminated_length": 3670.0, "completions/mean_length": 1535.4583740234375, "completions/mean_terminated_length": 1535.4583740234375, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "entropy": 0.4192938059568405, "epoch": 0.26574307304785894, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08106449886837944, "kl": 0.003861659672111273, "learning_rate": 8.363745441340026e-07, "loss": -0.0721, "num_tokens": 33299294.0, "reward": 2.1666667461395264, "reward_std": 0.30860671401023865, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4292622804641724, "sampling/importance_sampling_ratio/mean": 1.0001314878463745, "sampling/importance_sampling_ratio/min": 0.6557069420814514, "sampling/sampling_logp_difference/max": 0.42204129695892334, "sampling/sampling_logp_difference/mean": 0.009782465174794197, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5909.0, "completions/mean_length": 2301.875, "completions/mean_terminated_length": 2045.7825927734375, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "entropy": 0.4081329479813576, "epoch": 0.2663727959697733, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.13567834704227624, "kl": 0.004774378961883485, "learning_rate": 8.356420310766481e-07, "loss": 0.1455, "num_tokens": 33364875.0, "reward": 2.7916667461395264, "reward_std": 0.3535533845424652, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3774396181106567, "sampling/importance_sampling_ratio/mean": 1.0000494718551636, "sampling/importance_sampling_ratio/min": 0.39866936206817627, "sampling/sampling_logp_difference/max": 0.9196228981018066, "sampling/sampling_logp_difference/mean": 0.009400708600878716, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7130.0, "completions/max_terminated_length": 7130.0, "completions/mean_length": 2740.625, "completions/mean_terminated_length": 2740.625, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "entropy": 0.3415280357003212, "epoch": 0.26700251889168763, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09937671258174426, "kl": 0.003342093841638416, "learning_rate": 8.349082043835769e-07, "loss": 0.088, "num_tokens": 33441330.0, "reward": 2.7916667461395264, "reward_std": 0.48371022939682007, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000111699104309, "sampling/importance_sampling_ratio/min": 0.6485762000083923, "sampling/sampling_logp_difference/max": 1.5013117790222168, "sampling/sampling_logp_difference/mean": 0.008661140687763691, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7316.0, "completions/max_terminated_length": 7316.0, "completions/mean_length": 3406.75, "completions/mean_terminated_length": 3406.75, "completions/min_length": 1655.0, "completions/min_terminated_length": 1655.0, "entropy": 0.4173753038048744, "epoch": 0.267632241813602, "frac_reward_zero_std": 0.0, "grad_norm": 0.09946413733985784, "kl": 0.0033229702385142446, "learning_rate": 8.341730669268399e-07, "loss": 0.0611, "num_tokens": 33534476.0, "reward": 2.5416667461395264, "reward_std": 0.503990888595581, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.797567367553711, "sampling/importance_sampling_ratio/mean": 1.000065803527832, "sampling/importance_sampling_ratio/min": 0.4201711118221283, "sampling/sampling_logp_difference/max": 0.8670932650566101, "sampling/sampling_logp_difference/mean": 0.009864501655101776, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4900.0, "completions/max_terminated_length": 4900.0, "completions/mean_length": 2631.83349609375, "completions/mean_terminated_length": 2631.83349609375, "completions/min_length": 998.0, "completions/min_terminated_length": 998.0, "entropy": 0.2740399204194546, "epoch": 0.2682619647355164, "frac_reward_zero_std": 0.0, "grad_norm": 0.08548787374034869, "kl": 0.0035537496441975236, "learning_rate": 8.334366215836177e-07, "loss": -0.0582, "num_tokens": 33628536.0, "reward": 2.4166667461395264, "reward_std": 0.4857778251171112, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.510147213935852, "sampling/importance_sampling_ratio/mean": 1.0000487565994263, "sampling/importance_sampling_ratio/min": 0.6031988859176636, "sampling/sampling_logp_difference/max": 0.505508303642273, "sampling/sampling_logp_difference/mean": 0.007501679472625256, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4896.0, "completions/max_terminated_length": 4896.0, "completions/mean_length": 2787.08349609375, "completions/mean_terminated_length": 2787.08349609375, "completions/min_length": 1189.0, "completions/min_terminated_length": 1189.0, "entropy": 0.47736945003271103, "epoch": 0.2688916876574307, "frac_reward_zero_std": 0.0, "grad_norm": 0.19841025412715232, "kl": 0.004959704237990081, "learning_rate": 8.326988712362103e-07, "loss": -0.0143, "num_tokens": 33714266.0, "reward": 2.5, "reward_std": 0.4857778251171112, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999194145202637, "sampling/importance_sampling_ratio/min": 0.00980477686971426, "sampling/sampling_logp_difference/max": 4.624885559082031, "sampling/sampling_logp_difference/mean": 0.011440230533480644, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4195.0, "completions/max_terminated_length": 4195.0, "completions/mean_length": 2532.08349609375, "completions/mean_terminated_length": 2532.08349609375, "completions/min_length": 1091.0, "completions/min_terminated_length": 1091.0, "entropy": 0.5117327496409416, "epoch": 0.2695214105793451, "frac_reward_zero_std": 0.0, "grad_norm": 0.120312940243456, "kl": 0.003696874191518873, "learning_rate": 8.319598187720244e-07, "loss": 0.0863, "num_tokens": 33785708.0, "reward": 2.75, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.855974793434143, "sampling/importance_sampling_ratio/mean": 1.0004104375839233, "sampling/importance_sampling_ratio/min": 0.7160996794700623, "sampling/sampling_logp_difference/max": 0.6184101104736328, "sampling/sampling_logp_difference/mean": 0.01174682006239891, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4397.0, "completions/max_terminated_length": 4397.0, "completions/mean_length": 1964.3333740234375, "completions/mean_terminated_length": 1964.3333740234375, "completions/min_length": 913.0, "completions/min_terminated_length": 913.0, "entropy": 0.2071809023618698, "epoch": 0.27015113350125947, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030210307943028536, "kl": 0.0028593813767656684, "learning_rate": 8.312194670835639e-07, "loss": 0.0001, "num_tokens": 33846588.0, "reward": 3.0, "reward_std": 0.0, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.2698020935058594, "sampling/importance_sampling_ratio/mean": 1.0001258850097656, "sampling/importance_sampling_ratio/min": 0.5638533234596252, "sampling/sampling_logp_difference/max": 0.5729610919952393, "sampling/sampling_logp_difference/mean": 0.005466827657073736, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3691.0, "completions/max_terminated_length": 3691.0, "completions/mean_length": 1496.375, "completions/mean_terminated_length": 1496.375, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "entropy": 0.3249422684311867, "epoch": 0.2707808564231738, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06657657056275054, "kl": 0.003957639273721725, "learning_rate": 8.304778190684167e-07, "loss": -0.0307, "num_tokens": 33891613.0, "reward": 2.25, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4081963300704956, "sampling/importance_sampling_ratio/mean": 1.0001581907272339, "sampling/importance_sampling_ratio/min": 0.7510711550712585, "sampling/sampling_logp_difference/max": 0.34230971336364746, "sampling/sampling_logp_difference/mean": 0.008385210298001766, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5562.0, "completions/max_terminated_length": 5562.0, "completions/mean_length": 2319.875, "completions/mean_terminated_length": 2319.875, "completions/min_length": 589.0, "completions/min_terminated_length": 589.0, "entropy": 0.39711546897888184, "epoch": 0.27141057934508817, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10403558619951916, "kl": 0.003824387094937265, "learning_rate": 8.297348776292449e-07, "loss": 0.0912, "num_tokens": 33958186.0, "reward": 2.75, "reward_std": 0.40627965331077576, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6658225059509277, "sampling/importance_sampling_ratio/mean": 0.9999232292175293, "sampling/importance_sampling_ratio/min": 0.6361899375915527, "sampling/sampling_logp_difference/max": 0.5103189945220947, "sampling/sampling_logp_difference/mean": 0.009730929508805275, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5698.0, "completions/max_terminated_length": 5698.0, "completions/mean_length": 2626.416748046875, "completions/mean_terminated_length": 2626.416748046875, "completions/min_length": 1211.0, "completions/min_terminated_length": 1211.0, "entropy": 0.512130469083786, "epoch": 0.27204030226700254, "frac_reward_zero_std": 0.0, "grad_norm": 0.12780509200955928, "kl": 0.00445619085803628, "learning_rate": 8.289906456737724e-07, "loss": -0.1254, "num_tokens": 34033004.0, "reward": 2.4166667461395264, "reward_std": 0.6269428730010986, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999868273735046, "sampling/importance_sampling_ratio/min": 0.5769509077072144, "sampling/sampling_logp_difference/max": 0.6952329874038696, "sampling/sampling_logp_difference/mean": 0.01184726320207119, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4666.0, "completions/max_terminated_length": 4666.0, "completions/mean_length": 2055.33349609375, "completions/mean_terminated_length": 2055.33349609375, "completions/min_length": 756.0, "completions/min_terminated_length": 756.0, "entropy": 0.4106345921754837, "epoch": 0.27267002518891686, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.09013549978606253, "kl": 0.004528578952886164, "learning_rate": 8.28245126114774e-07, "loss": 0.0327, "num_tokens": 34094812.0, "reward": 2.75, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999317526817322, "sampling/importance_sampling_ratio/min": 0.5232043266296387, "sampling/sampling_logp_difference/max": 2.47293758392334, "sampling/sampling_logp_difference/mean": 0.010435018688440323, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4768.0, "completions/max_terminated_length": 4768.0, "completions/mean_length": 2512.541748046875, "completions/mean_terminated_length": 2512.541748046875, "completions/min_length": 1197.0, "completions/min_terminated_length": 1197.0, "entropy": 0.3874521628022194, "epoch": 0.27329974811083124, "frac_reward_zero_std": 0.0, "grad_norm": 0.09793100172137453, "kl": 0.004037811071611941, "learning_rate": 8.274983218700644e-07, "loss": 0.135, "num_tokens": 34173577.0, "reward": 2.4166667461395264, "reward_std": 0.584453821182251, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6562280654907227, "sampling/importance_sampling_ratio/mean": 0.9998311996459961, "sampling/importance_sampling_ratio/min": 0.7217391729354858, "sampling/sampling_logp_difference/max": 0.5045428276062012, "sampling/sampling_logp_difference/mean": 0.009321743622422218, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5945.0, "completions/max_terminated_length": 5945.0, "completions/mean_length": 1904.0833740234375, "completions/mean_terminated_length": 1904.0833740234375, "completions/min_length": 580.0, "completions/min_terminated_length": 580.0, "entropy": 0.4016880616545677, "epoch": 0.2739294710327456, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12772330776311722, "kl": 0.004093067022040486, "learning_rate": 8.267502358624855e-07, "loss": 0.0487, "num_tokens": 34229147.0, "reward": 2.7083334922790527, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3812330961227417, "sampling/importance_sampling_ratio/mean": 0.9998552203178406, "sampling/importance_sampling_ratio/min": 0.6208602786064148, "sampling/sampling_logp_difference/max": 0.47664928436279297, "sampling/sampling_logp_difference/mean": 0.009870462119579315, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4683.0, "completions/max_terminated_length": 4683.0, "completions/mean_length": 2409.875, "completions/mean_terminated_length": 2409.875, "completions/min_length": 613.0, "completions/min_terminated_length": 613.0, "entropy": 0.4159069135785103, "epoch": 0.27455919395465994, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10723099651831995, "kl": 0.00413322780514136, "learning_rate": 8.260008710198965e-07, "loss": 0.0271, "num_tokens": 34308280.0, "reward": 2.0833334922790527, "reward_std": 0.49179765582084656, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7517346143722534, "sampling/importance_sampling_ratio/mean": 0.9998990893363953, "sampling/importance_sampling_ratio/min": 0.5474667549133301, "sampling/sampling_logp_difference/max": 0.6024534702301025, "sampling/sampling_logp_difference/mean": 0.010719671845436096, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6868.0, "completions/max_terminated_length": 6868.0, "completions/mean_length": 3244.58349609375, "completions/mean_terminated_length": 3244.58349609375, "completions/min_length": 405.0, "completions/min_terminated_length": 405.0, "entropy": 0.3450712785124779, "epoch": 0.2751889168765743, "frac_reward_zero_std": 0.0, "grad_norm": 0.07983550297784195, "kl": 0.003888280247338116, "learning_rate": 8.252502302751611e-07, "loss": 0.0407, "num_tokens": 34413030.0, "reward": 1.9166667461395264, "reward_std": 0.7780460715293884, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000518560409546, "sampling/importance_sampling_ratio/min": 0.4967906177043915, "sampling/sampling_logp_difference/max": 1.191253662109375, "sampling/sampling_logp_difference/mean": 0.008417035453021526, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3632.0, "completions/max_terminated_length": 3632.0, "completions/mean_length": 1957.8333740234375, "completions/mean_terminated_length": 1957.8333740234375, "completions/min_length": 770.0, "completions/min_terminated_length": 770.0, "entropy": 0.40098660439252853, "epoch": 0.27581863979848864, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.08165991471259454, "kl": 0.004208279773592949, "learning_rate": 8.24498316566137e-07, "loss": 0.0061, "num_tokens": 34473402.0, "reward": 2.5, "reward_std": 0.30860671401023865, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5366482734680176, "sampling/importance_sampling_ratio/mean": 0.9998776912689209, "sampling/importance_sampling_ratio/min": 0.5625910758972168, "sampling/sampling_logp_difference/max": 0.575202226638794, "sampling/sampling_logp_difference/mean": 0.009692663326859474, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4776.0, "completions/max_terminated_length": 4776.0, "completions/mean_length": 2255.70849609375, "completions/mean_terminated_length": 2255.70849609375, "completions/min_length": 801.0, "completions/min_terminated_length": 801.0, "entropy": 0.41718632727861404, "epoch": 0.276448362720403, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09847774949729891, "kl": 0.0038168582832440734, "learning_rate": 8.237451328356639e-07, "loss": -0.0285, "num_tokens": 34542819.0, "reward": 2.6666667461395264, "reward_std": 0.43015047907829285, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999544024467468, "sampling/importance_sampling_ratio/min": 0.6103341579437256, "sampling/sampling_logp_difference/max": 0.7533798217773438, "sampling/sampling_logp_difference/mean": 0.010083291679620743, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3983.0, "completions/max_terminated_length": 3983.0, "completions/mean_length": 1633.916748046875, "completions/mean_terminated_length": 1633.916748046875, "completions/min_length": 714.0, "completions/min_terminated_length": 714.0, "entropy": 0.36284691095352173, "epoch": 0.2770780856423174, "frac_reward_zero_std": 0.0, "grad_norm": 0.09990732324923743, "kl": 0.004199121729470789, "learning_rate": 8.229906820315523e-07, "loss": -0.1247, "num_tokens": 34591105.0, "reward": 2.5, "reward_std": 0.4446708858013153, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3448898792266846, "sampling/importance_sampling_ratio/mean": 0.9999516010284424, "sampling/importance_sampling_ratio/min": 0.678168535232544, "sampling/sampling_logp_difference/max": 0.3883594274520874, "sampling/sampling_logp_difference/mean": 0.009156479500234127, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5798.0, "completions/max_terminated_length": 5798.0, "completions/mean_length": 3332.20849609375, "completions/mean_terminated_length": 3332.20849609375, "completions/min_length": 1830.0, "completions/min_terminated_length": 1830.0, "entropy": 0.2895377650856972, "epoch": 0.2777078085642317, "frac_reward_zero_std": 0.0, "grad_norm": 0.4680095165000097, "kl": 0.0037562515353783965, "learning_rate": 8.222349671065714e-07, "loss": -0.0068, "num_tokens": 34705054.0, "reward": 2.375, "reward_std": 0.5222300291061401, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999606609344482, "sampling/importance_sampling_ratio/min": 0.5351672172546387, "sampling/sampling_logp_difference/max": 0.7218492031097412, "sampling/sampling_logp_difference/mean": 0.007982634007930756, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4705.0, "completions/max_terminated_length": 4705.0, "completions/mean_length": 1993.375, "completions/mean_terminated_length": 1993.375, "completions/min_length": 695.0, "completions/min_terminated_length": 695.0, "entropy": 0.45984286814928055, "epoch": 0.2783375314861461, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.13962748712045345, "kl": 0.003871337277814746, "learning_rate": 8.214779910184384e-07, "loss": 0.1163, "num_tokens": 34761199.0, "reward": 2.5416667461395264, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002764463424683, "sampling/importance_sampling_ratio/min": 0.6326936483383179, "sampling/sampling_logp_difference/max": 0.7421975135803223, "sampling/sampling_logp_difference/mean": 0.011004505679011345, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5758.0, "completions/max_terminated_length": 5758.0, "completions/mean_length": 2748.20849609375, "completions/mean_terminated_length": 2748.20849609375, "completions/min_length": 1122.0, "completions/min_terminated_length": 1122.0, "entropy": 0.42383794486522675, "epoch": 0.27896725440806047, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09125541258576012, "kl": 0.003795831638853997, "learning_rate": 8.207197567298058e-07, "loss": 0.0487, "num_tokens": 34838068.0, "reward": 2.625, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6496798992156982, "sampling/importance_sampling_ratio/mean": 1.0000663995742798, "sampling/importance_sampling_ratio/min": 0.47857019305229187, "sampling/sampling_logp_difference/max": 0.7369524240493774, "sampling/sampling_logp_difference/mean": 0.010079524479806423, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7702.0, "completions/mean_length": 4242.875, "completions/mean_terminated_length": 4071.174072265625, "completions/min_length": 2034.0, "completions/min_terminated_length": 2034.0, "entropy": 0.44771526753902435, "epoch": 0.2795969773299748, "frac_reward_zero_std": 0.0, "grad_norm": 0.09807813748512859, "kl": 0.0035936576896347106, "learning_rate": 8.19960267208251e-07, "loss": 0.1163, "num_tokens": 34968529.0, "reward": 1.9166667461395264, "reward_std": 0.885280966758728, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001683235168457, "sampling/importance_sampling_ratio/min": 0.4532129168510437, "sampling/sampling_logp_difference/max": 1.3553380966186523, "sampling/sampling_logp_difference/mean": 0.010881158523261547, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7510.0, "completions/max_terminated_length": 7510.0, "completions/mean_length": 3385.75, "completions/mean_terminated_length": 3385.75, "completions/min_length": 1397.0, "completions/min_terminated_length": 1397.0, "entropy": 0.31826888024806976, "epoch": 0.28022670025188917, "frac_reward_zero_std": 0.0, "grad_norm": 0.0826418794555228, "kl": 0.0037009049556218088, "learning_rate": 8.19199525426264e-07, "loss": 0.0475, "num_tokens": 35071867.0, "reward": 2.4583334922790527, "reward_std": 0.6199029684066772, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8341987133026123, "sampling/importance_sampling_ratio/mean": 0.9999930262565613, "sampling/importance_sampling_ratio/min": 0.019569585099816322, "sampling/sampling_logp_difference/max": 3.933778762817383, "sampling/sampling_logp_difference/mean": 0.008479899726808071, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7362.0, "completions/max_terminated_length": 7362.0, "completions/mean_length": 3561.70849609375, "completions/mean_terminated_length": 3561.70849609375, "completions/min_length": 895.0, "completions/min_terminated_length": 895.0, "entropy": 0.5858901143074036, "epoch": 0.28085642317380355, "frac_reward_zero_std": 0.0, "grad_norm": 0.10007082752071281, "kl": 0.003893295826856047, "learning_rate": 8.184375343612359e-07, "loss": -0.1566, "num_tokens": 35171260.0, "reward": 2.0416667461395264, "reward_std": 0.6380135416984558, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6104429960250854, "sampling/importance_sampling_ratio/mean": 0.9999759793281555, "sampling/importance_sampling_ratio/min": 0.315903902053833, "sampling/sampling_logp_difference/max": 1.1523172855377197, "sampling/sampling_logp_difference/mean": 0.012708786875009537, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6146.0, "completions/max_terminated_length": 6146.0, "completions/mean_length": 3172.20849609375, "completions/mean_terminated_length": 3172.20849609375, "completions/min_length": 1573.0, "completions/min_terminated_length": 1573.0, "entropy": 0.3875843659043312, "epoch": 0.28148614609571787, "frac_reward_zero_std": 0.0, "grad_norm": 0.11092946630132053, "kl": 0.003960745874792337, "learning_rate": 8.176742969954468e-07, "loss": -0.0392, "num_tokens": 35260329.0, "reward": 2.2083334922790527, "reward_std": 0.7326218485832214, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.890199899673462, "sampling/importance_sampling_ratio/mean": 1.000113606452942, "sampling/importance_sampling_ratio/min": 0.29540568590164185, "sampling/sampling_logp_difference/max": 1.2194056510925293, "sampling/sampling_logp_difference/mean": 0.009599270299077034, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3351.0, "completions/max_terminated_length": 3351.0, "completions/mean_length": 1735.7083740234375, "completions/mean_terminated_length": 1735.7083740234375, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "entropy": 0.310046523809433, "epoch": 0.28211586901763225, "frac_reward_zero_std": 0.0, "grad_norm": 0.10616072950155565, "kl": 0.003914619330316782, "learning_rate": 8.169098163160555e-07, "loss": 0.0308, "num_tokens": 35324266.0, "reward": 2.625, "reward_std": 0.42645785212516785, "rewards/cloze_reward/mean": 0.8333333134651184, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7140085697174072, "sampling/importance_sampling_ratio/mean": 1.0002528429031372, "sampling/importance_sampling_ratio/min": 0.5945418477058411, "sampling/sampling_logp_difference/max": 0.538834810256958, "sampling/sampling_logp_difference/mean": 0.007951509207487106, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7404.0, "completions/max_terminated_length": 7404.0, "completions/mean_length": 3228.75, "completions/mean_terminated_length": 3228.75, "completions/min_length": 1326.0, "completions/min_terminated_length": 1326.0, "entropy": 0.33675171434879303, "epoch": 0.2827455919395466, "frac_reward_zero_std": 0.0, "grad_norm": 0.11436232091079365, "kl": 0.00323564896825701, "learning_rate": 8.16144095315086e-07, "loss": -0.0634, "num_tokens": 35426436.0, "reward": 2.0833334922790527, "reward_std": 0.5260698199272156, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.8789031505584717, "sampling/importance_sampling_ratio/mean": 0.9999518394470215, "sampling/importance_sampling_ratio/min": 0.19022683799266815, "sampling/sampling_logp_difference/max": 1.6595380306243896, "sampling/sampling_logp_difference/mean": 0.009452848695218563, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6870.0, "completions/max_terminated_length": 6870.0, "completions/mean_length": 4452.25, "completions/mean_terminated_length": 4452.25, "completions/min_length": 2513.0, "completions/min_terminated_length": 2513.0, "entropy": 0.4517928212881088, "epoch": 0.28337531486146095, "frac_reward_zero_std": 0.0, "grad_norm": 0.09027329755970685, "kl": 0.003740563173778355, "learning_rate": 8.153771369894169e-07, "loss": -0.0111, "num_tokens": 35555466.0, "reward": 1.9583333730697632, "reward_std": 0.7192515730857849, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.8117022514343262, "sampling/importance_sampling_ratio/mean": 0.9999501705169678, "sampling/importance_sampling_ratio/min": 0.26117825508117676, "sampling/sampling_logp_difference/max": 1.3425521850585938, "sampling/sampling_logp_difference/mean": 0.010959906503558159, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3272.0, "completions/max_terminated_length": 3272.0, "completions/mean_length": 1881.125, "completions/mean_terminated_length": 1881.125, "completions/min_length": 1104.0, "completions/min_terminated_length": 1104.0, "entropy": 0.4481332376599312, "epoch": 0.2840050377833753, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.3074195440838969, "kl": 0.004264648130629212, "learning_rate": 8.1460894434077e-07, "loss": 0.0763, "num_tokens": 35610661.0, "reward": 2.8333334922790527, "reward_std": 0.36585909128189087, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6605839729309082, "sampling/importance_sampling_ratio/mean": 0.9999289512634277, "sampling/importance_sampling_ratio/min": 0.7194344401359558, "sampling/sampling_logp_difference/max": 0.5071693658828735, "sampling/sampling_logp_difference/mean": 0.010438035242259502, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7817.0, "completions/max_terminated_length": 7817.0, "completions/mean_length": 3191.25, "completions/mean_terminated_length": 3191.25, "completions/min_length": 1394.0, "completions/min_terminated_length": 1394.0, "entropy": 0.40092625468969345, "epoch": 0.28463476070528965, "frac_reward_zero_std": 0.0, "grad_norm": 0.0865988708389453, "kl": 0.004119562450796366, "learning_rate": 8.138395203756972e-07, "loss": 0.0849, "num_tokens": 35709763.0, "reward": 2.2083334922790527, "reward_std": 0.5699716806411743, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.670108675956726, "sampling/importance_sampling_ratio/mean": 1.0000123977661133, "sampling/importance_sampling_ratio/min": 0.030437881126999855, "sampling/sampling_logp_difference/max": 3.492067337036133, "sampling/sampling_logp_difference/mean": 0.009410005994141102, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6135.0, "completions/max_terminated_length": 6135.0, "completions/mean_length": 2790.83349609375, "completions/mean_terminated_length": 2790.83349609375, "completions/min_length": 1262.0, "completions/min_terminated_length": 1262.0, "entropy": 0.38379764556884766, "epoch": 0.285264483627204, "frac_reward_zero_std": 0.0, "grad_norm": 0.09587022524207256, "kl": 0.004008870280813426, "learning_rate": 8.130688681055698e-07, "loss": 0.1022, "num_tokens": 35794087.0, "reward": 2.0833334922790527, "reward_std": 0.6040751338005066, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.7552989721298218, "sampling/importance_sampling_ratio/mean": 0.9997716546058655, "sampling/importance_sampling_ratio/min": 0.6093762516975403, "sampling/sampling_logp_difference/max": 0.5626392364501953, "sampling/sampling_logp_difference/mean": 0.009634492918848991, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3718.0, "completions/max_terminated_length": 3718.0, "completions/mean_length": 2432.45849609375, "completions/mean_terminated_length": 2432.45849609375, "completions/min_length": 1346.0, "completions/min_terminated_length": 1346.0, "entropy": 0.3428121507167816, "epoch": 0.2858942065491184, "frac_reward_zero_std": 0.0, "grad_norm": 0.09469779288844103, "kl": 0.004036265832837671, "learning_rate": 8.122969905465667e-07, "loss": 0.0189, "num_tokens": 35870490.0, "reward": 2.3333334922790527, "reward_std": 0.6821267604827881, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6734117269515991, "sampling/importance_sampling_ratio/mean": 1.0000239610671997, "sampling/importance_sampling_ratio/min": 0.6454455256462097, "sampling/sampling_logp_difference/max": 0.514864444732666, "sampling/sampling_logp_difference/mean": 0.00866326317191124, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7479.0, "completions/max_terminated_length": 7479.0, "completions/mean_length": 2724.5, "completions/mean_terminated_length": 2724.5, "completions/min_length": 878.0, "completions/min_terminated_length": 878.0, "entropy": 0.34496185928583145, "epoch": 0.2865239294710327, "frac_reward_zero_std": 0.0, "grad_norm": 0.09798175265800486, "kl": 0.003772095951717347, "learning_rate": 8.115238907196619e-07, "loss": 0.0261, "num_tokens": 35955910.0, "reward": 2.5, "reward_std": 0.4714045226573944, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4115546941757202, "sampling/importance_sampling_ratio/mean": 0.9999887943267822, "sampling/importance_sampling_ratio/min": 0.6380051374435425, "sampling/sampling_logp_difference/max": 0.44940900802612305, "sampling/sampling_logp_difference/mean": 0.008735327050089836, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4932.0, "completions/max_terminated_length": 4932.0, "completions/mean_length": 2555.33349609375, "completions/mean_terminated_length": 2555.33349609375, "completions/min_length": 865.0, "completions/min_terminated_length": 865.0, "entropy": 0.5187385976314545, "epoch": 0.2871536523929471, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1016180151745543, "kl": 0.004491201136261225, "learning_rate": 8.107495716506137e-07, "loss": -0.0647, "num_tokens": 36032958.0, "reward": 2.0416667461395264, "reward_std": 0.46798479557037354, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4615967273712158, "sampling/importance_sampling_ratio/mean": 1.0001744031906128, "sampling/importance_sampling_ratio/min": 0.4989124834537506, "sampling/sampling_logp_difference/max": 0.6953246593475342, "sampling/sampling_logp_difference/mean": 0.012213438749313354, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2459.0, "completions/max_terminated_length": 2459.0, "completions/mean_length": 1477.75, "completions/mean_terminated_length": 1477.75, "completions/min_length": 840.0, "completions/min_terminated_length": 840.0, "entropy": 0.34376389533281326, "epoch": 0.2877833753148615, "frac_reward_zero_std": 0.0, "grad_norm": 0.11317303091676068, "kl": 0.003911053529009223, "learning_rate": 8.099740363699516e-07, "loss": -0.0948, "num_tokens": 36076656.0, "reward": 2.375, "reward_std": 0.6055297255516052, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.6607493162155151, "sampling/importance_sampling_ratio/mean": 0.9999192357063293, "sampling/importance_sampling_ratio/min": 0.7727089524269104, "sampling/sampling_logp_difference/max": 0.5072689056396484, "sampling/sampling_logp_difference/mean": 0.008033815771341324, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6367.0, "completions/max_terminated_length": 6367.0, "completions/mean_length": 2432.375, "completions/mean_terminated_length": 2432.375, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "entropy": 0.33834249526262283, "epoch": 0.2884130982367758, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.08595263447958104, "kl": 0.004109038971364498, "learning_rate": 8.091972879129657e-07, "loss": -0.0537, "num_tokens": 36146865.0, "reward": 2.7083334922790527, "reward_std": 0.1178511306643486, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.6921405792236328, "sampling/importance_sampling_ratio/mean": 0.9999629855155945, "sampling/importance_sampling_ratio/min": 0.6853944063186646, "sampling/sampling_logp_difference/max": 0.5259943008422852, "sampling/sampling_logp_difference/mean": 0.008916031569242477, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7868.0, "completions/max_terminated_length": 7868.0, "completions/mean_length": 3975.291748046875, "completions/mean_terminated_length": 3975.291748046875, "completions/min_length": 1214.0, "completions/min_terminated_length": 1214.0, "entropy": 0.45537786930799484, "epoch": 0.2890428211586902, "frac_reward_zero_std": 0.0, "grad_norm": 0.09780776633135113, "kl": 0.003747817245312035, "learning_rate": 8.084193293196939e-07, "loss": -0.0715, "num_tokens": 36260280.0, "reward": 2.375, "reward_std": 0.6106518507003784, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.7882781028747559, "sampling/importance_sampling_ratio/mean": 0.9999892711639404, "sampling/importance_sampling_ratio/min": 0.4561805725097656, "sampling/sampling_logp_difference/max": 0.7848665714263916, "sampling/sampling_logp_difference/mean": 0.010784702375531197, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6952.0, "completions/mean_length": 3438.95849609375, "completions/mean_terminated_length": 3232.304443359375, "completions/min_length": 930.0, "completions/min_terminated_length": 930.0, "entropy": 0.3363035097718239, "epoch": 0.28967254408060455, "frac_reward_zero_std": 0.0, "grad_norm": 0.08580846087942712, "kl": 0.0034169284044764936, "learning_rate": 8.076401636349103e-07, "loss": 0.0675, "num_tokens": 36360575.0, "reward": 1.875, "reward_std": 0.5480016469955444, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5463573932647705, "sampling/importance_sampling_ratio/mean": 1.0000343322753906, "sampling/importance_sampling_ratio/min": 0.5162748098373413, "sampling/sampling_logp_difference/max": 0.6611161231994629, "sampling/sampling_logp_difference/mean": 0.008595800027251244, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4147.0, "completions/max_terminated_length": 4147.0, "completions/mean_length": 1899.541748046875, "completions/mean_terminated_length": 1899.541748046875, "completions/min_length": 903.0, "completions/min_terminated_length": 903.0, "entropy": 0.3192397579550743, "epoch": 0.2903022670025189, "frac_reward_zero_std": 0.0, "grad_norm": 0.10121637055908884, "kl": 0.0034925767104141414, "learning_rate": 8.068597939081133e-07, "loss": -0.082, "num_tokens": 36423084.0, "reward": 2.5416667461395264, "reward_std": 0.5383754968643188, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.733059287071228, "sampling/importance_sampling_ratio/mean": 0.9999735951423645, "sampling/importance_sampling_ratio/min": 0.37665292620658875, "sampling/sampling_logp_difference/max": 0.976431131362915, "sampling/sampling_logp_difference/mean": 0.0078119016252458096, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6105.0, "completions/max_terminated_length": 6105.0, "completions/mean_length": 4173.58349609375, "completions/mean_terminated_length": 4173.58349609375, "completions/min_length": 2500.0, "completions/min_terminated_length": 2500.0, "entropy": 0.23951486498117447, "epoch": 0.29093198992443325, "frac_reward_zero_std": 0.0, "grad_norm": 0.05856756401441714, "kl": 0.003225448541343212, "learning_rate": 8.060782231935142e-07, "loss": -0.0368, "num_tokens": 36558410.0, "reward": 2.25, "reward_std": 0.7558646202087402, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.731670618057251, "sampling/importance_sampling_ratio/mean": 0.9999209046363831, "sampling/importance_sampling_ratio/min": 0.458251416683197, "sampling/sampling_logp_difference/max": 0.7803373336791992, "sampling/sampling_logp_difference/mean": 0.0065147764980793, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3369.0, "completions/max_terminated_length": 3369.0, "completions/mean_length": 2204.83349609375, "completions/mean_terminated_length": 2204.83349609375, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.2964962050318718, "epoch": 0.29156171284634763, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07117334602090761, "kl": 0.0036956085241399705, "learning_rate": 8.052954545500238e-07, "loss": -0.0379, "num_tokens": 36628846.0, "reward": 2.7916667461395264, "reward_std": 0.3698274493217468, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.3934428691864014, "sampling/importance_sampling_ratio/mean": 1.0000327825546265, "sampling/importance_sampling_ratio/min": 0.687842071056366, "sampling/sampling_logp_difference/max": 0.37419605255126953, "sampling/sampling_logp_difference/mean": 0.007772968150675297, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6274.0, "completions/max_terminated_length": 6274.0, "completions/mean_length": 3554.0, "completions/mean_terminated_length": 3554.0, "completions/min_length": 1571.0, "completions/min_terminated_length": 1571.0, "entropy": 0.30552127212285995, "epoch": 0.29219143576826195, "frac_reward_zero_std": 0.0, "grad_norm": 0.07835446633250218, "kl": 0.003332510415930301, "learning_rate": 8.045114910412422e-07, "loss": -0.0567, "num_tokens": 36746574.0, "reward": 2.1666667461395264, "reward_std": 0.6461009979248047, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999308586120605, "sampling/importance_sampling_ratio/min": 0.05507952347397804, "sampling/sampling_logp_difference/max": 2.898977279663086, "sampling/sampling_logp_difference/mean": 0.00830184668302536, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4904.0, "completions/max_terminated_length": 4904.0, "completions/mean_length": 2273.70849609375, "completions/mean_terminated_length": 2273.70849609375, "completions/min_length": 884.0, "completions/min_terminated_length": 884.0, "entropy": 0.3482123166322708, "epoch": 0.29282115869017633, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08665636259909591, "kl": 0.0033854350331239402, "learning_rate": 8.037263357354455e-07, "loss": -0.0875, "num_tokens": 36811303.0, "reward": 2.8333334922790527, "reward_std": 0.2903675436973572, "rewards/cloze_reward/mean": 1.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.9865214824676514, "sampling/importance_sampling_ratio/mean": 1.0000392198562622, "sampling/importance_sampling_ratio/min": 0.4345625042915344, "sampling/sampling_logp_difference/max": 0.8334155082702637, "sampling/sampling_logp_difference/mean": 0.008683294989168644, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3056.0, "completions/max_terminated_length": 3056.0, "completions/mean_length": 1601.75, "completions/mean_terminated_length": 1601.75, "completions/min_length": 545.0, "completions/min_terminated_length": 545.0, "entropy": 0.29703111946582794, "epoch": 0.29345088161209065, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0820754449375741, "kl": 0.004446884791832417, "learning_rate": 8.029399917055745e-07, "loss": 0.0551, "num_tokens": 36862425.0, "reward": 2.9166667461395264, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4618556499481201, "sampling/importance_sampling_ratio/mean": 0.9997608065605164, "sampling/importance_sampling_ratio/min": 0.6950784921646118, "sampling/sampling_logp_difference/max": 0.37970662117004395, "sampling/sampling_logp_difference/mean": 0.007302212528884411, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4425.0, "completions/max_terminated_length": 4425.0, "completions/mean_length": 2156.916748046875, "completions/mean_terminated_length": 2156.916748046875, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "entropy": 0.32163646072149277, "epoch": 0.29408060453400503, "frac_reward_zero_std": 0.0, "grad_norm": 0.14082033515758183, "kl": 0.0040096743032336235, "learning_rate": 8.021524620292222e-07, "loss": -0.0371, "num_tokens": 36931991.0, "reward": 2.7083334922790527, "reward_std": 0.48112308979034424, "rewards/cloze_reward/mean": 0.875, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5985602140426636, "sampling/importance_sampling_ratio/mean": 1.00003182888031, "sampling/importance_sampling_ratio/min": 0.6649131178855896, "sampling/sampling_logp_difference/max": 0.4691033363342285, "sampling/sampling_logp_difference/mean": 0.008092537522315979, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8007.0, "completions/max_terminated_length": 8007.0, "completions/mean_length": 2895.45849609375, "completions/mean_terminated_length": 2895.45849609375, "completions/min_length": 926.0, "completions/min_terminated_length": 926.0, "entropy": 0.44303108006715775, "epoch": 0.2947103274559194, "frac_reward_zero_std": 0.0, "grad_norm": 0.0960072103673488, "kl": 0.004399741650559008, "learning_rate": 8.01363749788622e-07, "loss": -0.1454, "num_tokens": 37013474.0, "reward": 2.4166667461395264, "reward_std": 0.7288839221000671, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.312534213066101, "sampling/importance_sampling_ratio/mean": 1.0000110864639282, "sampling/importance_sampling_ratio/min": 0.5683128237724304, "sampling/sampling_logp_difference/max": 0.5650832653045654, "sampling/sampling_logp_difference/mean": 0.010873678140342236, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6823.0, "completions/max_terminated_length": 6823.0, "completions/mean_length": 2377.125, "completions/mean_terminated_length": 2377.125, "completions/min_length": 1345.0, "completions/min_terminated_length": 1345.0, "entropy": 0.4060232639312744, "epoch": 0.29534005037783373, "frac_reward_zero_std": 0.0, "grad_norm": 0.1107787764246954, "kl": 0.004488682723604143, "learning_rate": 8.00573858070636e-07, "loss": 0.0253, "num_tokens": 37086581.0, "reward": 2.0416667461395264, "reward_std": 0.8496841192245483, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.8152070045471191, "sampling/importance_sampling_ratio/mean": 1.0000262260437012, "sampling/importance_sampling_ratio/min": 0.40378615260124207, "sampling/sampling_logp_difference/max": 0.9068698883056641, "sampling/sampling_logp_difference/mean": 0.010264229960739613, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4912.0, "completions/max_terminated_length": 4912.0, "completions/mean_length": 2079.83349609375, "completions/mean_terminated_length": 2079.83349609375, "completions/min_length": 1161.0, "completions/min_terminated_length": 1161.0, "entropy": 0.2692866250872612, "epoch": 0.2959697732997481, "frac_reward_zero_std": 0.0, "grad_norm": 0.11898211739948449, "kl": 0.0037799873389303684, "learning_rate": 7.997827899667422e-07, "loss": 0.0646, "num_tokens": 37157849.0, "reward": 2.875, "reward_std": 0.3535533845424652, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.661749005317688, "sampling/importance_sampling_ratio/mean": 0.9999554753303528, "sampling/importance_sampling_ratio/min": 0.6820181012153625, "sampling/sampling_logp_difference/max": 0.5078706741333008, "sampling/sampling_logp_difference/mean": 0.00705000339075923, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6791.0, "completions/max_terminated_length": 6791.0, "completions/mean_length": 3348.541748046875, "completions/mean_terminated_length": 3348.541748046875, "completions/min_length": 880.0, "completions/min_terminated_length": 880.0, "entropy": 0.45644184947013855, "epoch": 0.2965994962216625, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0855037159841356, "kl": 0.004353415803052485, "learning_rate": 7.989905485730225e-07, "loss": -0.0242, "num_tokens": 37253214.0, "reward": 2.5416667461395264, "reward_std": 0.2721545100212097, "rewards/cloze_reward/mean": 0.9166666865348816, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.400863528251648, "sampling/importance_sampling_ratio/mean": 0.9998080134391785, "sampling/importance_sampling_ratio/min": 0.5714393258094788, "sampling/sampling_logp_difference/max": 0.5595970153808594, "sampling/sampling_logp_difference/mean": 0.010798253118991852, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3171.0, "completions/max_terminated_length": 3171.0, "completions/mean_length": 1543.0833740234375, "completions/mean_terminated_length": 1543.0833740234375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 0.37171996384859085, "epoch": 0.2972292191435768, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1052208039438181, "kl": 0.005156719242222607, "learning_rate": 7.981971369901513e-07, "loss": -0.0756, "num_tokens": 37306904.0, "reward": 2.6666667461395264, "reward_std": 0.3314744830131531, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.5435336828231812, "sampling/importance_sampling_ratio/mean": 1.0000331401824951, "sampling/importance_sampling_ratio/min": 0.4468894898891449, "sampling/sampling_logp_difference/max": 0.8054440021514893, "sampling/sampling_logp_difference/mean": 0.009682286530733109, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 1748.8333740234375, "completions/mean_terminated_length": 1748.8333740234375, "completions/min_length": 673.0, "completions/min_terminated_length": 673.0, "entropy": 0.2934280335903168, "epoch": 0.2978589420654912, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08481522775509065, "kl": 0.004948131740093231, "learning_rate": 7.974025583233828e-07, "loss": -0.073, "num_tokens": 37367652.0, "reward": 2.75, "reward_std": 0.40627965331077576, "rewards/cloze_reward/mean": 0.9583333134651184, "rewards/cloze_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.4412509202957153, "sampling/importance_sampling_ratio/mean": 0.9999983906745911, "sampling/importance_sampling_ratio/min": 0.38870006799697876, "sampling/sampling_logp_difference/max": 0.9449472427368164, "sampling/sampling_logp_difference/mean": 0.0076422556303441525, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7010.0, "completions/max_terminated_length": 7010.0, "completions/mean_length": 2862.125, "completions/mean_terminated_length": 2862.125, "completions/min_length": 788.0, "completions/min_terminated_length": 788.0, "entropy": 0.3120402917265892, "epoch": 0.29848866498740556, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08138576849335058, "kl": 0.003946125041693449, "learning_rate": 7.966068156825387e-07, "loss": -0.0118, "num_tokens": 37459031.0, "reward": 2.375, "reward_std": 0.3506905436515808, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5260642766952515, "sampling/importance_sampling_ratio/mean": 1.0000439882278442, "sampling/importance_sampling_ratio/min": 0.47579073905944824, "sampling/sampling_logp_difference/max": 0.7427771091461182, "sampling/sampling_logp_difference/mean": 0.00815754197537899, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4440.0, "completions/max_terminated_length": 4440.0, "completions/mean_length": 1598.125, "completions/mean_terminated_length": 1598.125, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "entropy": 0.36152276396751404, "epoch": 0.2991183879093199, "frac_reward_zero_std": 0.0, "grad_norm": 0.15127808940095072, "kl": 0.005406708689406514, "learning_rate": 7.958099121819966e-07, "loss": 0.1784, "num_tokens": 37512642.0, "reward": 2.4166667461395264, "reward_std": 0.5201624631881714, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3407889604568481, "sampling/importance_sampling_ratio/mean": 0.999690592288971, "sampling/importance_sampling_ratio/min": 0.762846827507019, "sampling/sampling_logp_difference/max": 0.2932581901550293, "sampling/sampling_logp_difference/mean": 0.009310301393270493, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3856.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 1717.875, "completions/mean_terminated_length": 1717.875, "completions/min_length": 748.0, "completions/min_terminated_length": 748.0, "entropy": 0.34719162434339523, "epoch": 0.29974811083123426, "frac_reward_zero_std": 0.0, "grad_norm": 0.13188407515330647, "kl": 0.004439172684215009, "learning_rate": 7.950118509406769e-07, "loss": -0.0001, "num_tokens": 37563599.0, "reward": 2.2916667461395264, "reward_std": 0.5863928198814392, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.418663740158081, "sampling/importance_sampling_ratio/mean": 0.9999856352806091, "sampling/importance_sampling_ratio/min": 0.6429253816604614, "sampling/sampling_logp_difference/max": 0.4417266845703125, "sampling/sampling_logp_difference/mean": 0.008619530126452446, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3960.0, "completions/max_terminated_length": 3960.0, "completions/mean_length": 2100.125, "completions/mean_terminated_length": 2100.125, "completions/min_length": 767.0, "completions/min_terminated_length": 767.0, "entropy": 0.34835384786129, "epoch": 0.30037783375314864, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09593824087669298, "kl": 0.003915244247764349, "learning_rate": 7.942126350820317e-07, "loss": -0.0521, "num_tokens": 37628562.0, "reward": 2.125, "reward_std": 0.3268197476863861, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.4716689586639404, "sampling/importance_sampling_ratio/mean": 1.0000605583190918, "sampling/importance_sampling_ratio/min": 0.5677284598350525, "sampling/sampling_logp_difference/max": 0.5661120414733887, "sampling/sampling_logp_difference/mean": 0.008735178038477898, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5291.0, "completions/max_terminated_length": 5291.0, "completions/mean_length": 1914.75, "completions/mean_terminated_length": 1914.75, "completions/min_length": 720.0, "completions/min_terminated_length": 720.0, "entropy": 0.2947171665728092, "epoch": 0.30100755667506296, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.08848557440625444, "kl": 0.004553494276478887, "learning_rate": 7.93412267734032e-07, "loss": 0.0267, "num_tokens": 37688108.0, "reward": 2.5416667461395264, "reward_std": 0.3535533845424652, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148510992527008, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998130202293396, "sampling/importance_sampling_ratio/min": 0.5370738506317139, "sampling/sampling_logp_difference/max": 0.763153076171875, "sampling/sampling_logp_difference/mean": 0.00789311621338129, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7063.0, "completions/max_terminated_length": 7063.0, "completions/mean_length": 2598.375, "completions/mean_terminated_length": 2598.375, "completions/min_length": 1109.0, "completions/min_terminated_length": 1109.0, "entropy": 0.4186927601695061, "epoch": 0.30163727959697734, "frac_reward_zero_std": 0.0, "grad_norm": 0.11307478230195192, "kl": 0.004626343143172562, "learning_rate": 7.926107520291554e-07, "loss": 0.0468, "num_tokens": 37761325.0, "reward": 1.875, "reward_std": 0.5383754968643188, "rewards/cloze_reward/mean": 0.2916666567325592, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4421736001968384, "sampling/importance_sampling_ratio/mean": 1.0000361204147339, "sampling/importance_sampling_ratio/min": 0.6278385519981384, "sampling/sampling_logp_difference/max": 0.4654722213745117, "sampling/sampling_logp_difference/mean": 0.010732954367995262, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7142.0, "completions/max_terminated_length": 7142.0, "completions/mean_length": 3251.291748046875, "completions/mean_terminated_length": 3251.291748046875, "completions/min_length": 1583.0, "completions/min_terminated_length": 1583.0, "entropy": 0.34509994089603424, "epoch": 0.3022670025188917, "frac_reward_zero_std": 0.0, "grad_norm": 0.09659608864012444, "kl": 0.0034701244439929724, "learning_rate": 7.918080911043736e-07, "loss": 0.0336, "num_tokens": 37880852.0, "reward": 2.625, "reward_std": 0.47419947385787964, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 1.447143316268921, "sampling/importance_sampling_ratio/mean": 0.9998149275779724, "sampling/importance_sampling_ratio/min": 0.5415821671485901, "sampling/sampling_logp_difference/max": 0.6132605075836182, "sampling/sampling_logp_difference/mean": 0.008426077663898468, "step": 480 } ], "logging_steps": 1.0, "max_steps": 1588, "num_input_tokens_seen": 37880852, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }