{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.05509641873278237, "eval_steps": 500, "global_step": 40, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2915.0, "completions/max_terminated_length": 2915.0, "completions/mean_length": 1900.916748046875, "completions/mean_terminated_length": 1900.916748046875, "completions/min_length": 1153.0, "completions/min_terminated_length": 1153.0, "entropy": 0.29732953757047653, "epoch": 0.0013774104683195593, "frac_reward_zero_std": 0.0, "grad_norm": 0.2848003042646487, "kl": 0.0, "learning_rate": 1e-06, "loss": 0.065, "num_tokens": 62470.0, "reward": 1.875, "reward_std": 0.7653362154960632, "rewards/cloze_reward/mean": 0.25, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "rewards/format_reward/mean": 0.7083333134651184, "rewards/format_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0003361701965332, "sampling/importance_sampling_ratio/min": 0.12010933458805084, "sampling/sampling_logp_difference/max": 2.1193528175354004, "sampling/sampling_logp_difference/mean": 0.013728929683566093, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4209.0, "completions/max_terminated_length": 4209.0, "completions/mean_length": 1807.125, "completions/mean_terminated_length": 1807.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.4276350364089012, "epoch": 0.0027548209366391185, "frac_reward_zero_std": 0.0, "grad_norm": 0.27170591036025904, "kl": 0.0017779993431759067, "learning_rate": 9.999953187068845e-07, "loss": -0.0052, "num_tokens": 144433.0, "reward": 1.375, "reward_std": 0.697779655456543, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 0.5416666865348816, "rewards/format_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000282526016235, "sampling/importance_sampling_ratio/min": 0.054640088230371475, "sampling/sampling_logp_difference/max": 2.906987428665161, "sampling/sampling_logp_difference/mean": 0.014421815052628517, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5955.0, "completions/max_terminated_length": 5955.0, "completions/mean_length": 1929.8333740234375, "completions/mean_terminated_length": 1929.8333740234375, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "entropy": 0.15682650730013847, "epoch": 0.004132231404958678, "frac_reward_zero_std": 0.0, "grad_norm": 0.23379512087809812, "kl": 0.00041251888615079224, "learning_rate": 9.999812749151967e-07, "loss": 0.0655, "num_tokens": 208821.0, "reward": 2.375, "reward_std": 0.9505624771118164, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "rewards/format_reward/mean": 0.7916666865348816, "rewards/format_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001002550125122, "sampling/importance_sampling_ratio/min": 0.04702043533325195, "sampling/sampling_logp_difference/max": 3.057173013687134, "sampling/sampling_logp_difference/mean": 0.008513483218848705, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6342.0, "completions/max_terminated_length": 6342.0, "completions/mean_length": 3278.5, "completions/mean_terminated_length": 3278.5, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "entropy": 0.23897437751293182, "epoch": 0.005509641873278237, "frac_reward_zero_std": 0.0, "grad_norm": 0.16457158766091876, "kl": 0.0004863361318712123, "learning_rate": 9.999578688879084e-07, "loss": 0.0713, "num_tokens": 306505.0, "reward": 2.1666667461395264, "reward_std": 0.6947464346885681, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001260042190552, "sampling/importance_sampling_ratio/min": 0.1858593225479126, "sampling/sampling_logp_difference/max": 1.682765245437622, "sampling/sampling_logp_difference/mean": 0.012356276623904705, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 5875.0, "completions/mean_length": 2768.08349609375, "completions/mean_terminated_length": 2275.0, "completions/min_length": 772.0, "completions/min_terminated_length": 772.0, "entropy": 0.30327995866537094, "epoch": 0.006887052341597796, "frac_reward_zero_std": 0.0, "grad_norm": 0.24551726103614221, "kl": 0.000520530593348667, "learning_rate": 9.999251010633018e-07, "loss": 0.1599, "num_tokens": 388843.0, "reward": 1.875, "reward_std": 0.6055297255516052, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 0.7916666865348816, "rewards/format_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001081228256226, "sampling/importance_sampling_ratio/min": 0.07036004215478897, "sampling/sampling_logp_difference/max": 2.65412974357605, "sampling/sampling_logp_difference/mean": 0.014815382659435272, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4871.0, "completions/mean_length": 2609.45849609375, "completions/mean_terminated_length": 2366.7392578125, "completions/min_length": 1340.0, "completions/min_terminated_length": 1340.0, "entropy": 0.2581055983901024, "epoch": 0.008264462809917356, "frac_reward_zero_std": 0.0, "grad_norm": 0.2529070676429162, "kl": 0.0005087396057206206, "learning_rate": 9.998829720549602e-07, "loss": 0.1667, "num_tokens": 468398.0, "reward": 2.1666667461395264, "reward_std": 0.5970090627670288, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997397065162659, "sampling/importance_sampling_ratio/min": 0.13575026392936707, "sampling/sampling_logp_difference/max": 1.9969384670257568, "sampling/sampling_logp_difference/mean": 0.012986009940505028, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2916666865348816, "completions/max_length": 8192.0, "completions/max_terminated_length": 7889.0, "completions/mean_length": 4482.95849609375, "completions/mean_terminated_length": 2955.705810546875, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "entropy": 0.3291335925459862, "epoch": 0.009641873278236915, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.15909916853519945, "kl": 0.000712596025550738, "learning_rate": 9.998314826517563e-07, "loss": 0.062, "num_tokens": 600053.0, "reward": 0.8333333730697632, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.125, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.7083333134651184, "rewards/format_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999268054962158, "sampling/importance_sampling_ratio/min": 0.1226503923535347, "sampling/sampling_logp_difference/max": 2.098417282104492, "sampling/sampling_logp_difference/mean": 0.01600910723209381, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4810.0, "completions/max_terminated_length": 4810.0, "completions/mean_length": 1951.5, "completions/mean_terminated_length": 1951.5, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.2874010130763054, "epoch": 0.011019283746556474, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.22479321113386752, "kl": 0.0006854519015178084, "learning_rate": 9.99770633817838e-07, "loss": 0.0803, "num_tokens": 666825.0, "reward": 2.0833334922790527, "reward_std": 0.4778915345668793, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00015127658844, "sampling/importance_sampling_ratio/min": 0.3547181487083435, "sampling/sampling_logp_difference/max": 1.0364317893981934, "sampling/sampling_logp_difference/mean": 0.012998003512620926, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7278.0, "completions/max_terminated_length": 7278.0, "completions/mean_length": 3300.791748046875, "completions/mean_terminated_length": 3300.791748046875, "completions/min_length": 1615.0, "completions/min_terminated_length": 1615.0, "entropy": 0.3161199018359184, "epoch": 0.012396694214876033, "frac_reward_zero_std": 0.0, "grad_norm": 0.22023288782689693, "kl": 0.0009353299537906423, "learning_rate": 9.997004266926104e-07, "loss": -0.063, "num_tokens": 769572.0, "reward": 1.7083333730697632, "reward_std": 0.5078567266464233, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000706911087036, "sampling/importance_sampling_ratio/min": 0.0014537055976688862, "sampling/sampling_logp_difference/max": 6.533639430999756, "sampling/sampling_logp_difference/mean": 0.015122607350349426, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6813.0, "completions/mean_length": 2792.58349609375, "completions/mean_terminated_length": 2557.826171875, "completions/min_length": 875.0, "completions/min_terminated_length": 875.0, "entropy": 0.28332703560590744, "epoch": 0.013774104683195593, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.16710769042355098, "kl": 0.0008509723120369017, "learning_rate": 9.99620862590714e-07, "loss": 0.1348, "num_tokens": 858722.0, "reward": 2.0, "reward_std": 0.39000558853149414, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000204086303711, "sampling/importance_sampling_ratio/min": 0.2697003185749054, "sampling/sampling_logp_difference/max": 1.3104438781738281, "sampling/sampling_logp_difference/mean": 0.012643387541174889, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7190.0, "completions/mean_length": 3368.791748046875, "completions/mean_terminated_length": 2679.761962890625, "completions/min_length": 785.0, "completions/min_terminated_length": 785.0, "entropy": 0.25256290286779404, "epoch": 0.015151515151515152, "frac_reward_zero_std": 0.0, "grad_norm": 0.17645257817462404, "kl": 0.0011589039204409346, "learning_rate": 9.995319430020003e-07, "loss": 0.1181, "num_tokens": 959845.0, "reward": 1.9583333730697632, "reward_std": 0.7435590624809265, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.875, "rewards/format_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000051259994507, "sampling/importance_sampling_ratio/min": 0.011985468678176403, "sampling/sampling_logp_difference/max": 4.424060344696045, "sampling/sampling_logp_difference/mean": 0.012076009064912796, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6058.0, "completions/max_terminated_length": 6058.0, "completions/mean_length": 2589.5, "completions/mean_terminated_length": 2589.5, "completions/min_length": 1266.0, "completions/min_terminated_length": 1266.0, "entropy": 0.2638929449021816, "epoch": 0.01652892561983471, "frac_reward_zero_std": 0.0, "grad_norm": 0.18010496944516938, "kl": 0.00098221683583688, "learning_rate": 9.99433669591504e-07, "loss": 0.1127, "num_tokens": 1042353.0, "reward": 1.5, "reward_std": 0.3900056481361389, "rewards/cloze_reward/mean": 0.125, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999534487724304, "sampling/importance_sampling_ratio/min": 0.36291250586509705, "sampling/sampling_logp_difference/max": 1.3159668445587158, "sampling/sampling_logp_difference/mean": 0.013833053410053253, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3234.0, "completions/max_terminated_length": 3234.0, "completions/mean_length": 2078.20849609375, "completions/mean_terminated_length": 2078.20849609375, "completions/min_length": 1470.0, "completions/min_terminated_length": 1470.0, "entropy": 0.18643449991941452, "epoch": 0.01790633608815427, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.23135066334203877, "kl": 0.0031637964711990207, "learning_rate": 9.993260441994114e-07, "loss": -0.002, "num_tokens": 1140390.0, "reward": 1.375, "reward_std": 0.5085179209709167, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.9693641662597656, "sampling/importance_sampling_ratio/mean": 0.9994663596153259, "sampling/importance_sampling_ratio/min": 0.0002035096986219287, "sampling/sampling_logp_difference/max": 8.499796867370605, "sampling/sampling_logp_difference/mean": 0.011178883723914623, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6206.0, "completions/max_terminated_length": 6206.0, "completions/mean_length": 3248.25, "completions/mean_terminated_length": 3248.25, "completions/min_length": 686.0, "completions/min_terminated_length": 686.0, "entropy": 0.28004511073231697, "epoch": 0.01928374655647383, "frac_reward_zero_std": 0.0, "grad_norm": 0.2111608534215747, "kl": 0.0016034767031669617, "learning_rate": 9.99209068841027e-07, "loss": -0.055, "num_tokens": 1235948.0, "reward": 2.125, "reward_std": 0.4082186818122864, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996271133422852, "sampling/importance_sampling_ratio/min": 0.16397786140441895, "sampling/sampling_logp_difference/max": 1.8080238103866577, "sampling/sampling_logp_difference/mean": 0.014200962148606777, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6290.0, "completions/max_terminated_length": 6290.0, "completions/mean_length": 2156.5, "completions/mean_terminated_length": 2156.5, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "entropy": 0.275097768753767, "epoch": 0.02066115702479339, "frac_reward_zero_std": 0.0, "grad_norm": 0.26723864416803433, "kl": 0.0022536640462931246, "learning_rate": 9.99082745706734e-07, "loss": 0.2325, "num_tokens": 1304496.0, "reward": 2.125, "reward_std": 0.503990888595581, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997144341468811, "sampling/importance_sampling_ratio/min": 0.01566636748611927, "sampling/sampling_logp_difference/max": 4.156239032745361, "sampling/sampling_logp_difference/mean": 0.013950522057712078, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 5627.0, "completions/mean_length": 2773.95849609375, "completions/mean_terminated_length": 2281.4091796875, "completions/min_length": 1168.0, "completions/min_terminated_length": 1168.0, "entropy": 0.24988408759236336, "epoch": 0.02203856749311295, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.2002566600366291, "kl": 0.0017052267503459007, "learning_rate": 9.989470771619553e-07, "loss": 0.1211, "num_tokens": 1394159.0, "reward": 1.875, "reward_std": 0.5535047054290771, "rewards/cloze_reward/mean": 0.2083333283662796, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000037670135498, "sampling/importance_sampling_ratio/min": 0.08789712190628052, "sampling/sampling_logp_difference/max": 4.325733184814453, "sampling/sampling_logp_difference/mean": 0.012733320705592632, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7212.0, "completions/max_terminated_length": 7212.0, "completions/mean_length": 3347.291748046875, "completions/mean_terminated_length": 3347.291748046875, "completions/min_length": 829.0, "completions/min_terminated_length": 829.0, "entropy": 0.2636777497828007, "epoch": 0.023415977961432508, "frac_reward_zero_std": 0.0, "grad_norm": 0.1893310489949768, "kl": 0.0020791269780602306, "learning_rate": 9.988020657471077e-07, "loss": -0.1188, "num_tokens": 1504110.0, "reward": 1.625, "reward_std": 0.5480016469955444, "rewards/cloze_reward/mean": 0.125, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997584819793701, "sampling/importance_sampling_ratio/min": 0.021851731464266777, "sampling/sampling_logp_difference/max": 3.8234751224517822, "sampling/sampling_logp_difference/mean": 0.014004481956362724, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2283.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 1511.5, "completions/mean_terminated_length": 1511.5, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "entropy": 0.14751360192894936, "epoch": 0.024793388429752067, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.20794435707592182, "kl": 0.0025478473398834467, "learning_rate": 9.98647714177555e-07, "loss": -0.0004, "num_tokens": 1567602.0, "reward": 2.3333334922790527, "reward_std": 0.2357022613286972, "rewards/cloze_reward/mean": 0.375, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9995601177215576, "sampling/importance_sampling_ratio/min": 0.19188205897808075, "sampling/sampling_logp_difference/max": 1.650874376296997, "sampling/sampling_logp_difference/mean": 0.009327586740255356, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6061.0, "completions/max_terminated_length": 6061.0, "completions/mean_length": 2516.166748046875, "completions/mean_terminated_length": 2516.166748046875, "completions/min_length": 953.0, "completions/min_terminated_length": 953.0, "entropy": 0.27825580164790154, "epoch": 0.026170798898071626, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.16189513218269783, "kl": 0.0023431787849403918, "learning_rate": 9.984840253435568e-07, "loss": 0.0076, "num_tokens": 1653054.0, "reward": 2.125, "reward_std": 0.17251639068126678, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999192357063293, "sampling/importance_sampling_ratio/min": 0.06552005559206009, "sampling/sampling_logp_difference/max": 2.7253990173339844, "sampling/sampling_logp_difference/mean": 0.013308782130479813, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5089.0, "completions/max_terminated_length": 5089.0, "completions/mean_length": 1440.9583740234375, "completions/mean_terminated_length": 1440.9583740234375, "completions/min_length": 896.0, "completions/min_terminated_length": 896.0, "entropy": 0.2597235403954983, "epoch": 0.027548209366391185, "frac_reward_zero_std": 0.0, "grad_norm": 0.3058200238239626, "kl": 0.004811614111531526, "learning_rate": 9.983110023102145e-07, "loss": -0.0732, "num_tokens": 1709421.0, "reward": 2.0833334922790527, "reward_std": 0.6257078647613525, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002235174179077, "sampling/importance_sampling_ratio/min": 0.010991987772285938, "sampling/sampling_logp_difference/max": 4.510588645935059, "sampling/sampling_logp_difference/mean": 0.012450532987713814, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6266.0, "completions/mean_length": 2971.95849609375, "completions/mean_terminated_length": 2745.0, "completions/min_length": 725.0, "completions/min_terminated_length": 725.0, "entropy": 0.23062221333384514, "epoch": 0.028925619834710745, "frac_reward_zero_std": 0.0, "grad_norm": 0.20922776699151296, "kl": 0.002331310883164406, "learning_rate": 9.981286483174148e-07, "loss": 0.0302, "num_tokens": 1808892.0, "reward": 1.6666667461395264, "reward_std": 0.6947464346885681, "rewards/cloze_reward/mean": 0.1666666716337204, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999830424785614, "sampling/importance_sampling_ratio/min": 5.94212829982399e-13, "sampling/sampling_logp_difference/max": 28.151538848876953, "sampling/sampling_logp_difference/mean": 0.013190208002924919, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3810.0, "completions/max_terminated_length": 3810.0, "completions/mean_length": 2031.5, "completions/mean_terminated_length": 2031.5, "completions/min_length": 1207.0, "completions/min_terminated_length": 1207.0, "entropy": 0.1824016384780407, "epoch": 0.030303030303030304, "frac_reward_zero_std": 0.0, "grad_norm": 0.2952973607886351, "kl": 0.004044964851345867, "learning_rate": 9.979369667797674e-07, "loss": 0.0554, "num_tokens": 1911664.0, "reward": 1.5, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.0, "rewards/cloze_reward/std": 0.0, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998181462287903, "sampling/importance_sampling_ratio/min": 0.0026749928947538137, "sampling/sampling_logp_difference/max": 5.923808574676514, "sampling/sampling_logp_difference/mean": 0.011196551844477654, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7742.0, "completions/max_terminated_length": 7742.0, "completions/mean_length": 2664.791748046875, "completions/mean_terminated_length": 2664.791748046875, "completions/min_length": 716.0, "completions/min_terminated_length": 716.0, "entropy": 0.16602139174938202, "epoch": 0.03168044077134986, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.18646426939832977, "kl": 0.0027883301954716444, "learning_rate": 9.977359612865422e-07, "loss": -0.1659, "num_tokens": 2005483.0, "reward": 2.0, "reward_std": 0.34503278136253357, "rewards/cloze_reward/mean": 0.5416666865348816, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000344514846802, "sampling/importance_sampling_ratio/min": 2.7512096333753844e-12, "sampling/sampling_logp_difference/max": 26.618980407714844, "sampling/sampling_logp_difference/mean": 0.010194670408964157, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7718.0, "completions/mean_length": 4289.5, "completions/mean_terminated_length": 4119.826171875, "completions/min_length": 979.0, "completions/min_terminated_length": 979.0, "entropy": 0.2995450422167778, "epoch": 0.03305785123966942, "frac_reward_zero_std": 0.0, "grad_norm": 0.170700316120424, "kl": 0.0018004105950240046, "learning_rate": 9.975256356016017e-07, "loss": 0.0699, "num_tokens": 2128183.0, "reward": 2.0, "reward_std": 0.6288648843765259, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999317526817322, "sampling/importance_sampling_ratio/min": 0.08646544814109802, "sampling/sampling_logp_difference/max": 2.4480104446411133, "sampling/sampling_logp_difference/mean": 0.014326645992696285, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7758.0, "completions/max_terminated_length": 7758.0, "completions/mean_length": 3043.45849609375, "completions/mean_terminated_length": 3043.45849609375, "completions/min_length": 1282.0, "completions/min_terminated_length": 1282.0, "entropy": 0.2883247286081314, "epoch": 0.03443526170798898, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.12988436998967914, "kl": 0.0025270123151130974, "learning_rate": 9.973059936633306e-07, "loss": 0.0545, "num_tokens": 2218418.0, "reward": 2.2916667461395264, "reward_std": 0.21362332999706268, "rewards/cloze_reward/mean": 0.75, "rewards/cloze_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999663829803467, "sampling/importance_sampling_ratio/min": 0.10468851774930954, "sampling/sampling_logp_difference/max": 2.256765842437744, "sampling/sampling_logp_difference/mean": 0.012674104422330856, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4710.0, "completions/mean_length": 2069.625, "completions/mean_terminated_length": 1803.434814453125, "completions/min_length": 935.0, "completions/min_terminated_length": 935.0, "entropy": 0.1856878437101841, "epoch": 0.03581267217630854, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.2189265719041607, "kl": 0.0038806990487501025, "learning_rate": 9.970770395845622e-07, "loss": 0.3118, "num_tokens": 2309321.0, "reward": 2.375, "reward_std": 0.5061727166175842, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998181462287903, "sampling/importance_sampling_ratio/min": 0.23940998315811157, "sampling/sampling_logp_difference/max": 1.4295778274536133, "sampling/sampling_logp_difference/mean": 0.009506626054644585, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8177.0, "completions/mean_length": 4914.20849609375, "completions/mean_terminated_length": 4771.69580078125, "completions/min_length": 1792.0, "completions/min_terminated_length": 1792.0, "entropy": 0.3651472330093384, "epoch": 0.0371900826446281, "frac_reward_zero_std": 0.0, "grad_norm": 0.16794783472528907, "kl": 0.0032920141238719225, "learning_rate": 9.968387776525007e-07, "loss": 0.0044, "num_tokens": 2447670.0, "reward": 1.7083333730697632, "reward_std": 0.6274997591972351, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998521208763123, "sampling/importance_sampling_ratio/min": 0.00022299536794889718, "sampling/sampling_logp_difference/max": 8.40835952758789, "sampling/sampling_logp_difference/mean": 0.017919987440109253, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7702.0, "completions/max_terminated_length": 7702.0, "completions/mean_length": 3003.375, "completions/mean_terminated_length": 3003.375, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "entropy": 0.30894363671541214, "epoch": 0.03856749311294766, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.2662075831826717, "kl": 0.003954145242460072, "learning_rate": 9.965912123286424e-07, "loss": 0.085, "num_tokens": 2537663.0, "reward": 2.5416667461395264, "reward_std": 0.4082186222076416, "rewards/cloze_reward/mean": 0.7083333134651184, "rewards/cloze_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000332593917847, "sampling/importance_sampling_ratio/min": 0.21365012228488922, "sampling/sampling_logp_difference/max": 2.255774974822998, "sampling/sampling_logp_difference/mean": 0.014759156852960587, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7521.0, "completions/mean_length": 2249.291748046875, "completions/mean_terminated_length": 1990.9130859375, "completions/min_length": 932.0, "completions/min_terminated_length": 932.0, "entropy": 0.20293106883764267, "epoch": 0.03994490358126722, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.30621070115238647, "kl": 0.004532316525001079, "learning_rate": 9.963343482486905e-07, "loss": 0.1541, "num_tokens": 2608934.0, "reward": 2.4166667461395264, "reward_std": 0.5028601884841919, "rewards/cloze_reward/mean": 0.5833333134651184, "rewards/cloze_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 0.9583333134651184, "rewards/format_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000059962272644, "sampling/importance_sampling_ratio/min": 0.2822778820991516, "sampling/sampling_logp_difference/max": 1.3750309944152832, "sampling/sampling_logp_difference/mean": 0.011287961155176163, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6912.0, "completions/max_terminated_length": 6912.0, "completions/mean_length": 2896.416748046875, "completions/mean_terminated_length": 2896.416748046875, "completions/min_length": 1072.0, "completions/min_terminated_length": 1072.0, "entropy": 0.24397730827331543, "epoch": 0.04132231404958678, "frac_reward_zero_std": 0.0, "grad_norm": 0.2637496585585402, "kl": 0.0052707926370203495, "learning_rate": 9.96068190222469e-07, "loss": -0.0809, "num_tokens": 2696368.0, "reward": 2.3333334922790527, "reward_std": 0.7106520533561707, "rewards/cloze_reward/mean": 0.7916666865348816, "rewards/cloze_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000116229057312, "sampling/importance_sampling_ratio/min": 0.26414525508880615, "sampling/sampling_logp_difference/max": 1.3681449890136719, "sampling/sampling_logp_difference/mean": 0.012104719877243042, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5352.0, "completions/max_terminated_length": 5352.0, "completions/mean_length": 1967.75, "completions/mean_terminated_length": 1967.75, "completions/min_length": 804.0, "completions/min_terminated_length": 804.0, "entropy": 0.21574077382683754, "epoch": 0.04269972451790634, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.12651354951166632, "kl": 0.0038389787659980357, "learning_rate": 9.95792743233833e-07, "loss": 0.1096, "num_tokens": 2766314.0, "reward": 2.4166667461395264, "reward_std": 0.15430335700511932, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9996727108955383, "sampling/importance_sampling_ratio/min": 0.11743912100791931, "sampling/sampling_logp_difference/max": 2.1418352127075195, "sampling/sampling_logp_difference/mean": 0.011217731051146984, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6975.0, "completions/max_terminated_length": 6975.0, "completions/mean_length": 2998.25, "completions/mean_terminated_length": 2998.25, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "entropy": 0.34803634136915207, "epoch": 0.0440771349862259, "frac_reward_zero_std": 0.0, "grad_norm": 0.18832712543023003, "kl": 0.003418200241867453, "learning_rate": 9.95508012440575e-07, "loss": 0.0195, "num_tokens": 2858208.0, "reward": 2.25, "reward_std": 0.5748276710510254, "rewards/cloze_reward/mean": 0.625, "rewards/cloze_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001078844070435, "sampling/importance_sampling_ratio/min": 0.33018070459365845, "sampling/sampling_logp_difference/max": 1.1081151962280273, "sampling/sampling_logp_difference/mean": 0.01556301862001419, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4436.0, "completions/max_terminated_length": 4436.0, "completions/mean_length": 2222.33349609375, "completions/mean_terminated_length": 2222.33349609375, "completions/min_length": 961.0, "completions/min_terminated_length": 961.0, "entropy": 0.2771223820745945, "epoch": 0.045454545454545456, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.17495461933111076, "kl": 0.0048337242333218455, "learning_rate": 9.95214003174328e-07, "loss": -0.0133, "num_tokens": 2930472.0, "reward": 2.5, "reward_std": 0.36585909128189087, "rewards/cloze_reward/mean": 0.6666666865348816, "rewards/cloze_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997199177742004, "sampling/importance_sampling_ratio/min": 0.17066244781017303, "sampling/sampling_logp_difference/max": 2.808061122894287, "sampling/sampling_logp_difference/mean": 0.012434008531272411, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6776.0, "completions/max_terminated_length": 6776.0, "completions/mean_length": 3140.70849609375, "completions/mean_terminated_length": 3140.70849609375, "completions/min_length": 726.0, "completions/min_terminated_length": 726.0, "entropy": 0.28713061660528183, "epoch": 0.046831955922865015, "frac_reward_zero_std": 0.0, "grad_norm": 0.5763177612619246, "kl": 0.005852235772181302, "learning_rate": 9.949107209404663e-07, "loss": 0.0443, "num_tokens": 3043473.0, "reward": 2.0416667461395264, "reward_std": 0.5222300291061401, "rewards/cloze_reward/mean": 0.5, "rewards/cloze_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002092123031616, "sampling/importance_sampling_ratio/min": 0.010155132971704006, "sampling/sampling_logp_difference/max": 4.589776039123535, "sampling/sampling_logp_difference/mean": 0.012913873419165611, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7808.0, "completions/mean_length": 2782.75, "completions/mean_terminated_length": 2291.0, "completions/min_length": 736.0, "completions/min_terminated_length": 736.0, "entropy": 0.23918093740940094, "epoch": 0.048209366391184574, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.19663548974567993, "kl": 0.004491619183681905, "learning_rate": 9.94598171418002e-07, "loss": 0.1035, "num_tokens": 3126827.0, "reward": 2.1666667461395264, "reward_std": 0.35634833574295044, "rewards/cloze_reward/mean": 0.4166666567325592, "rewards/cloze_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "rewards/format_reward/mean": 0.9166666865348816, "rewards/format_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.8434656858444214, "sampling/importance_sampling_ratio/mean": 1.0001463890075684, "sampling/importance_sampling_ratio/min": 0.14892546832561493, "sampling/sampling_logp_difference/max": 1.9043092727661133, "sampling/sampling_logp_difference/mean": 0.011121454648673534, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6397.0, "completions/max_terminated_length": 6397.0, "completions/mean_length": 2130.70849609375, "completions/mean_terminated_length": 2130.70849609375, "completions/min_length": 908.0, "completions/min_terminated_length": 908.0, "entropy": 0.25559694319963455, "epoch": 0.049586776859504134, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.20044650714243367, "kl": 0.004921266925521195, "learning_rate": 9.94276360459479e-07, "loss": 0.0668, "num_tokens": 3204780.0, "reward": 2.0, "reward_std": 0.34503278136253357, "rewards/cloze_reward/mean": 0.125, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998623728752136, "sampling/importance_sampling_ratio/min": 0.17689232528209686, "sampling/sampling_logp_difference/max": 1.732214093208313, "sampling/sampling_logp_difference/mean": 0.012361792847514153, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4222.0, "completions/max_terminated_length": 4222.0, "completions/mean_length": 1950.666748046875, "completions/mean_terminated_length": 1950.666748046875, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "entropy": 0.2424805425107479, "epoch": 0.05096418732782369, "frac_reward_zero_std": 0.0, "grad_norm": 0.2413970223564091, "kl": 0.004224332165904343, "learning_rate": 9.939452940908626e-07, "loss": 0.0541, "num_tokens": 3282572.0, "reward": 1.8333333730697632, "reward_std": 0.48678088188171387, "rewards/cloze_reward/mean": 0.0833333358168602, "rewards/cloze_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999829113483429, "sampling/importance_sampling_ratio/min": 0.16516301035881042, "sampling/sampling_logp_difference/max": 1.800822377204895, "sampling/sampling_logp_difference/mean": 0.01168552041053772, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6989.0, "completions/max_terminated_length": 6989.0, "completions/mean_length": 2920.041748046875, "completions/mean_terminated_length": 2920.041748046875, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "entropy": 0.2422230765223503, "epoch": 0.05234159779614325, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.16283138475922154, "kl": 0.004039893392473459, "learning_rate": 9.936049785114279e-07, "loss": 0.0391, "num_tokens": 3380365.0, "reward": 1.8333333730697632, "reward_std": 0.43015047907829285, "rewards/cloze_reward/mean": 0.125, "rewards/cloze_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998534321784973, "sampling/importance_sampling_ratio/min": 0.11749733984470367, "sampling/sampling_logp_difference/max": 2.1413395404815674, "sampling/sampling_logp_difference/mean": 0.012631865218281746, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7304.0, "completions/max_terminated_length": 7304.0, "completions/mean_length": 3322.041748046875, "completions/mean_terminated_length": 3322.041748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4101082310080528, "epoch": 0.05371900826446281, "frac_reward_zero_std": 0.0, "grad_norm": 0.181795370019662, "kl": 0.0036983516183681786, "learning_rate": 9.932554200936426e-07, "loss": -0.1581, "num_tokens": 3488278.0, "reward": 1.2083333730697632, "reward_std": 0.6043562889099121, "rewards/cloze_reward/mean": 0.1666666716337204, "rewards/cloze_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "rewards/format_reward/mean": 0.6666666865348816, "rewards/format_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999675154685974, "sampling/importance_sampling_ratio/min": 0.04754088446497917, "sampling/sampling_logp_difference/max": 3.0461652278900146, "sampling/sampling_logp_difference/mean": 0.01610734686255455, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5011.0, "completions/max_terminated_length": 5011.0, "completions/mean_length": 2942.58349609375, "completions/mean_terminated_length": 2942.58349609375, "completions/min_length": 1454.0, "completions/min_terminated_length": 1454.0, "entropy": 0.35880884528160095, "epoch": 0.05509641873278237, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.17000406623354494, "kl": 0.003319010080303997, "learning_rate": 9.92896625383049e-07, "loss": -0.0559, "num_tokens": 3575956.0, "reward": 1.875, "reward_std": 0.4023112952709198, "rewards/cloze_reward/mean": 0.4583333432674408, "rewards/cloze_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000556707382202, "sampling/importance_sampling_ratio/min": 0.27387282252311707, "sampling/sampling_logp_difference/max": 1.2950913906097412, "sampling/sampling_logp_difference/mean": 0.01579936034977436, "step": 40 } ], "logging_steps": 1.0, "max_steps": 726, "num_input_tokens_seen": 3575956, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }