| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.05509641873278237, | |
| "eval_steps": 500, | |
| "global_step": 40, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2915.0, | |
| "completions/max_terminated_length": 2915.0, | |
| "completions/mean_length": 1900.916748046875, | |
| "completions/mean_terminated_length": 1900.916748046875, | |
| "completions/min_length": 1153.0, | |
| "completions/min_terminated_length": 1153.0, | |
| "entropy": 0.29732953757047653, | |
| "epoch": 0.0013774104683195593, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2848003042646487, | |
| "kl": 0.0, | |
| "learning_rate": 1e-06, | |
| "loss": 0.065, | |
| "num_tokens": 62470.0, | |
| "reward": 1.875, | |
| "reward_std": 0.7653362154960632, | |
| "rewards/cloze_reward/mean": 0.25, | |
| "rewards/cloze_reward/std": 0.4423258602619171, | |
| "rewards/code_reward/mean": 0.9166666865348816, | |
| "rewards/code_reward/std": 0.28232985734939575, | |
| "rewards/format_reward/mean": 0.7083333134651184, | |
| "rewards/format_reward/std": 0.4643056094646454, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0003361701965332, | |
| "sampling/importance_sampling_ratio/min": 0.12010933458805084, | |
| "sampling/sampling_logp_difference/max": 2.1193528175354004, | |
| "sampling/sampling_logp_difference/mean": 0.013728929683566093, | |
| "step": 1 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4209.0, | |
| "completions/max_terminated_length": 4209.0, | |
| "completions/mean_length": 1807.125, | |
| "completions/mean_terminated_length": 1807.125, | |
| "completions/min_length": 3.0, | |
| "completions/min_terminated_length": 3.0, | |
| "entropy": 0.4276350364089012, | |
| "epoch": 0.0027548209366391185, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.27170591036025904, | |
| "kl": 0.0017779993431759067, | |
| "learning_rate": 9.999953187068845e-07, | |
| "loss": -0.0052, | |
| "num_tokens": 144433.0, | |
| "reward": 1.375, | |
| "reward_std": 0.697779655456543, | |
| "rewards/cloze_reward/mean": 0.4166666567325592, | |
| "rewards/cloze_reward/std": 0.5036101937294006, | |
| "rewards/code_reward/mean": 0.4166666567325592, | |
| "rewards/code_reward/std": 0.5036101937294006, | |
| "rewards/format_reward/mean": 0.5416666865348816, | |
| "rewards/format_reward/std": 0.5089773535728455, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0000282526016235, | |
| "sampling/importance_sampling_ratio/min": 0.054640088230371475, | |
| "sampling/sampling_logp_difference/max": 2.906987428665161, | |
| "sampling/sampling_logp_difference/mean": 0.014421815052628517, | |
| "step": 2 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5955.0, | |
| "completions/max_terminated_length": 5955.0, | |
| "completions/mean_length": 1929.8333740234375, | |
| "completions/mean_terminated_length": 1929.8333740234375, | |
| "completions/min_length": 30.0, | |
| "completions/min_terminated_length": 30.0, | |
| "entropy": 0.15682650730013847, | |
| "epoch": 0.004132231404958678, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.23379512087809812, | |
| "kl": 0.00041251888615079224, | |
| "learning_rate": 9.999812749151967e-07, | |
| "loss": 0.0655, | |
| "num_tokens": 208821.0, | |
| "reward": 2.375, | |
| "reward_std": 0.9505624771118164, | |
| "rewards/cloze_reward/mean": 0.7916666865348816, | |
| "rewards/cloze_reward/std": 0.4148511290550232, | |
| "rewards/code_reward/mean": 0.7916666865348816, | |
| "rewards/code_reward/std": 0.4148510992527008, | |
| "rewards/format_reward/mean": 0.7916666865348816, | |
| "rewards/format_reward/std": 0.4148511290550232, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0001002550125122, | |
| "sampling/importance_sampling_ratio/min": 0.04702043533325195, | |
| "sampling/sampling_logp_difference/max": 3.057173013687134, | |
| "sampling/sampling_logp_difference/mean": 0.008513483218848705, | |
| "step": 3 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6342.0, | |
| "completions/max_terminated_length": 6342.0, | |
| "completions/mean_length": 3278.5, | |
| "completions/mean_terminated_length": 3278.5, | |
| "completions/min_length": 990.0, | |
| "completions/min_terminated_length": 990.0, | |
| "entropy": 0.23897437751293182, | |
| "epoch": 0.005509641873278237, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16457158766091876, | |
| "kl": 0.0004863361318712123, | |
| "learning_rate": 9.999578688879084e-07, | |
| "loss": 0.0713, | |
| "num_tokens": 306505.0, | |
| "reward": 2.1666667461395264, | |
| "reward_std": 0.6947464346885681, | |
| "rewards/cloze_reward/mean": 0.4583333432674408, | |
| "rewards/cloze_reward/std": 0.5089773535728455, | |
| "rewards/code_reward/mean": 0.7916666865348816, | |
| "rewards/code_reward/std": 0.4148511290550232, | |
| "rewards/format_reward/mean": 0.9166666865348816, | |
| "rewards/format_reward/std": 0.28232985734939575, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0001260042190552, | |
| "sampling/importance_sampling_ratio/min": 0.1858593225479126, | |
| "sampling/sampling_logp_difference/max": 1.682765245437622, | |
| "sampling/sampling_logp_difference/mean": 0.012356276623904705, | |
| "step": 4 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0833333358168602, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 5875.0, | |
| "completions/mean_length": 2768.08349609375, | |
| "completions/mean_terminated_length": 2275.0, | |
| "completions/min_length": 772.0, | |
| "completions/min_terminated_length": 772.0, | |
| "entropy": 0.30327995866537094, | |
| "epoch": 0.006887052341597796, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.24551726103614221, | |
| "kl": 0.000520530593348667, | |
| "learning_rate": 9.999251010633018e-07, | |
| "loss": 0.1599, | |
| "num_tokens": 388843.0, | |
| "reward": 1.875, | |
| "reward_std": 0.6055297255516052, | |
| "rewards/cloze_reward/mean": 0.5833333134651184, | |
| "rewards/cloze_reward/std": 0.5036101341247559, | |
| "rewards/code_reward/mean": 0.5, | |
| "rewards/code_reward/std": 0.5107539296150208, | |
| "rewards/format_reward/mean": 0.7916666865348816, | |
| "rewards/format_reward/std": 0.4148510992527008, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0001081228256226, | |
| "sampling/importance_sampling_ratio/min": 0.07036004215478897, | |
| "sampling/sampling_logp_difference/max": 2.65412974357605, | |
| "sampling/sampling_logp_difference/mean": 0.014815382659435272, | |
| "step": 5 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 4871.0, | |
| "completions/mean_length": 2609.45849609375, | |
| "completions/mean_terminated_length": 2366.7392578125, | |
| "completions/min_length": 1340.0, | |
| "completions/min_terminated_length": 1340.0, | |
| "entropy": 0.2581055983901024, | |
| "epoch": 0.008264462809917356, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2529070676429162, | |
| "kl": 0.0005087396057206206, | |
| "learning_rate": 9.998829720549602e-07, | |
| "loss": 0.1667, | |
| "num_tokens": 468398.0, | |
| "reward": 2.1666667461395264, | |
| "reward_std": 0.5970090627670288, | |
| "rewards/cloze_reward/mean": 0.4583333432674408, | |
| "rewards/cloze_reward/std": 0.5089773535728455, | |
| "rewards/code_reward/mean": 0.75, | |
| "rewards/code_reward/std": 0.4423258602619171, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9997397065162659, | |
| "sampling/importance_sampling_ratio/min": 0.13575026392936707, | |
| "sampling/sampling_logp_difference/max": 1.9969384670257568, | |
| "sampling/sampling_logp_difference/mean": 0.012986009940505028, | |
| "step": 6 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.2916666865348816, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 7889.0, | |
| "completions/mean_length": 4482.95849609375, | |
| "completions/mean_terminated_length": 2955.705810546875, | |
| "completions/min_length": 809.0, | |
| "completions/min_terminated_length": 809.0, | |
| "entropy": 0.3291335925459862, | |
| "epoch": 0.009641873278236915, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.15909916853519945, | |
| "kl": 0.000712596025550738, | |
| "learning_rate": 9.998314826517563e-07, | |
| "loss": 0.062, | |
| "num_tokens": 600053.0, | |
| "reward": 0.8333333730697632, | |
| "reward_std": 0.48678088188171387, | |
| "rewards/cloze_reward/mean": 0.0, | |
| "rewards/cloze_reward/std": 0.0, | |
| "rewards/code_reward/mean": 0.125, | |
| "rewards/code_reward/std": 0.337831974029541, | |
| "rewards/format_reward/mean": 0.7083333134651184, | |
| "rewards/format_reward/std": 0.4643056094646454, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9999268054962158, | |
| "sampling/importance_sampling_ratio/min": 0.1226503923535347, | |
| "sampling/sampling_logp_difference/max": 2.098417282104492, | |
| "sampling/sampling_logp_difference/mean": 0.01600910723209381, | |
| "step": 7 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4810.0, | |
| "completions/max_terminated_length": 4810.0, | |
| "completions/mean_length": 1951.5, | |
| "completions/mean_terminated_length": 1951.5, | |
| "completions/min_length": 779.0, | |
| "completions/min_terminated_length": 779.0, | |
| "entropy": 0.2874010130763054, | |
| "epoch": 0.011019283746556474, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.22479321113386752, | |
| "kl": 0.0006854519015178084, | |
| "learning_rate": 9.99770633817838e-07, | |
| "loss": 0.0803, | |
| "num_tokens": 666825.0, | |
| "reward": 2.0833334922790527, | |
| "reward_std": 0.4778915345668793, | |
| "rewards/cloze_reward/mean": 0.4166666567325592, | |
| "rewards/cloze_reward/std": 0.5036101937294006, | |
| "rewards/code_reward/mean": 0.6666666865348816, | |
| "rewards/code_reward/std": 0.4815434217453003, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.00015127658844, | |
| "sampling/importance_sampling_ratio/min": 0.3547181487083435, | |
| "sampling/sampling_logp_difference/max": 1.0364317893981934, | |
| "sampling/sampling_logp_difference/mean": 0.012998003512620926, | |
| "step": 8 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7278.0, | |
| "completions/max_terminated_length": 7278.0, | |
| "completions/mean_length": 3300.791748046875, | |
| "completions/mean_terminated_length": 3300.791748046875, | |
| "completions/min_length": 1615.0, | |
| "completions/min_terminated_length": 1615.0, | |
| "entropy": 0.3161199018359184, | |
| "epoch": 0.012396694214876033, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.22023288782689693, | |
| "kl": 0.0009353299537906423, | |
| "learning_rate": 9.997004266926104e-07, | |
| "loss": -0.063, | |
| "num_tokens": 769572.0, | |
| "reward": 1.7083333730697632, | |
| "reward_std": 0.5078567266464233, | |
| "rewards/cloze_reward/mean": 0.375, | |
| "rewards/cloze_reward/std": 0.494535356760025, | |
| "rewards/code_reward/mean": 0.3333333432674408, | |
| "rewards/code_reward/std": 0.4815433919429779, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0000706911087036, | |
| "sampling/importance_sampling_ratio/min": 0.0014537055976688862, | |
| "sampling/sampling_logp_difference/max": 6.533639430999756, | |
| "sampling/sampling_logp_difference/mean": 0.015122607350349426, | |
| "step": 9 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 6813.0, | |
| "completions/mean_length": 2792.58349609375, | |
| "completions/mean_terminated_length": 2557.826171875, | |
| "completions/min_length": 875.0, | |
| "completions/min_terminated_length": 875.0, | |
| "entropy": 0.28332703560590744, | |
| "epoch": 0.013774104683195593, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.16710769042355098, | |
| "kl": 0.0008509723120369017, | |
| "learning_rate": 9.99620862590714e-07, | |
| "loss": 0.1348, | |
| "num_tokens": 858722.0, | |
| "reward": 2.0, | |
| "reward_std": 0.39000558853149414, | |
| "rewards/cloze_reward/mean": 0.375, | |
| "rewards/cloze_reward/std": 0.494535356760025, | |
| "rewards/code_reward/mean": 0.7083333134651184, | |
| "rewards/code_reward/std": 0.4643056094646454, | |
| "rewards/format_reward/mean": 0.9166666865348816, | |
| "rewards/format_reward/std": 0.28232985734939575, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.000204086303711, | |
| "sampling/importance_sampling_ratio/min": 0.2697003185749054, | |
| "sampling/sampling_logp_difference/max": 1.3104438781738281, | |
| "sampling/sampling_logp_difference/mean": 0.012643387541174889, | |
| "step": 10 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 7190.0, | |
| "completions/mean_length": 3368.791748046875, | |
| "completions/mean_terminated_length": 2679.761962890625, | |
| "completions/min_length": 785.0, | |
| "completions/min_terminated_length": 785.0, | |
| "entropy": 0.25256290286779404, | |
| "epoch": 0.015151515151515152, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.17645257817462404, | |
| "kl": 0.0011589039204409346, | |
| "learning_rate": 9.995319430020003e-07, | |
| "loss": 0.1181, | |
| "num_tokens": 959845.0, | |
| "reward": 1.9583333730697632, | |
| "reward_std": 0.7435590624809265, | |
| "rewards/cloze_reward/mean": 0.5416666865348816, | |
| "rewards/cloze_reward/std": 0.5089773535728455, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 0.875, | |
| "rewards/format_reward/std": 0.337831974029541, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0000051259994507, | |
| "sampling/importance_sampling_ratio/min": 0.011985468678176403, | |
| "sampling/sampling_logp_difference/max": 4.424060344696045, | |
| "sampling/sampling_logp_difference/mean": 0.012076009064912796, | |
| "step": 11 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6058.0, | |
| "completions/max_terminated_length": 6058.0, | |
| "completions/mean_length": 2589.5, | |
| "completions/mean_terminated_length": 2589.5, | |
| "completions/min_length": 1266.0, | |
| "completions/min_terminated_length": 1266.0, | |
| "entropy": 0.2638929449021816, | |
| "epoch": 0.01652892561983471, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18010496944516938, | |
| "kl": 0.00098221683583688, | |
| "learning_rate": 9.99433669591504e-07, | |
| "loss": 0.1127, | |
| "num_tokens": 1042353.0, | |
| "reward": 1.5, | |
| "reward_std": 0.3900056481361389, | |
| "rewards/cloze_reward/mean": 0.125, | |
| "rewards/cloze_reward/std": 0.337831974029541, | |
| "rewards/code_reward/mean": 0.375, | |
| "rewards/code_reward/std": 0.494535356760025, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9999534487724304, | |
| "sampling/importance_sampling_ratio/min": 0.36291250586509705, | |
| "sampling/sampling_logp_difference/max": 1.3159668445587158, | |
| "sampling/sampling_logp_difference/mean": 0.013833053410053253, | |
| "step": 12 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3234.0, | |
| "completions/max_terminated_length": 3234.0, | |
| "completions/mean_length": 2078.20849609375, | |
| "completions/mean_terminated_length": 2078.20849609375, | |
| "completions/min_length": 1470.0, | |
| "completions/min_terminated_length": 1470.0, | |
| "entropy": 0.18643449991941452, | |
| "epoch": 0.01790633608815427, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.23135066334203877, | |
| "kl": 0.0031637964711990207, | |
| "learning_rate": 9.993260441994114e-07, | |
| "loss": -0.002, | |
| "num_tokens": 1140390.0, | |
| "reward": 1.375, | |
| "reward_std": 0.5085179209709167, | |
| "rewards/cloze_reward/mean": 0.2083333283662796, | |
| "rewards/cloze_reward/std": 0.4148511290550232, | |
| "rewards/code_reward/mean": 0.2083333283662796, | |
| "rewards/code_reward/std": 0.4148511290550232, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 1.9693641662597656, | |
| "sampling/importance_sampling_ratio/mean": 0.9994663596153259, | |
| "sampling/importance_sampling_ratio/min": 0.0002035096986219287, | |
| "sampling/sampling_logp_difference/max": 8.499796867370605, | |
| "sampling/sampling_logp_difference/mean": 0.011178883723914623, | |
| "step": 13 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6206.0, | |
| "completions/max_terminated_length": 6206.0, | |
| "completions/mean_length": 3248.25, | |
| "completions/mean_terminated_length": 3248.25, | |
| "completions/min_length": 686.0, | |
| "completions/min_terminated_length": 686.0, | |
| "entropy": 0.28004511073231697, | |
| "epoch": 0.01928374655647383, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2111608534215747, | |
| "kl": 0.0016034767031669617, | |
| "learning_rate": 9.99209068841027e-07, | |
| "loss": -0.055, | |
| "num_tokens": 1235948.0, | |
| "reward": 2.125, | |
| "reward_std": 0.4082186818122864, | |
| "rewards/cloze_reward/mean": 0.625, | |
| "rewards/cloze_reward/std": 0.494535356760025, | |
| "rewards/code_reward/mean": 0.5, | |
| "rewards/code_reward/std": 0.5107539296150208, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9996271133422852, | |
| "sampling/importance_sampling_ratio/min": 0.16397786140441895, | |
| "sampling/sampling_logp_difference/max": 1.8080238103866577, | |
| "sampling/sampling_logp_difference/mean": 0.014200962148606777, | |
| "step": 14 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6290.0, | |
| "completions/max_terminated_length": 6290.0, | |
| "completions/mean_length": 2156.5, | |
| "completions/mean_terminated_length": 2156.5, | |
| "completions/min_length": 851.0, | |
| "completions/min_terminated_length": 851.0, | |
| "entropy": 0.275097768753767, | |
| "epoch": 0.02066115702479339, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.26723864416803433, | |
| "kl": 0.0022536640462931246, | |
| "learning_rate": 9.99082745706734e-07, | |
| "loss": 0.2325, | |
| "num_tokens": 1304496.0, | |
| "reward": 2.125, | |
| "reward_std": 0.503990888595581, | |
| "rewards/cloze_reward/mean": 0.4166666567325592, | |
| "rewards/cloze_reward/std": 0.5036101937294006, | |
| "rewards/code_reward/mean": 0.7083333134651184, | |
| "rewards/code_reward/std": 0.4643056094646454, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9997144341468811, | |
| "sampling/importance_sampling_ratio/min": 0.01566636748611927, | |
| "sampling/sampling_logp_difference/max": 4.156239032745361, | |
| "sampling/sampling_logp_difference/mean": 0.013950522057712078, | |
| "step": 15 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0833333358168602, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 5627.0, | |
| "completions/mean_length": 2773.95849609375, | |
| "completions/mean_terminated_length": 2281.4091796875, | |
| "completions/min_length": 1168.0, | |
| "completions/min_terminated_length": 1168.0, | |
| "entropy": 0.24988408759236336, | |
| "epoch": 0.02203856749311295, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.2002566600366291, | |
| "kl": 0.0017052267503459007, | |
| "learning_rate": 9.989470771619553e-07, | |
| "loss": 0.1211, | |
| "num_tokens": 1394159.0, | |
| "reward": 1.875, | |
| "reward_std": 0.5535047054290771, | |
| "rewards/cloze_reward/mean": 0.2083333283662796, | |
| "rewards/cloze_reward/std": 0.4148511290550232, | |
| "rewards/code_reward/mean": 0.75, | |
| "rewards/code_reward/std": 0.4423258602619171, | |
| "rewards/format_reward/mean": 0.9166666865348816, | |
| "rewards/format_reward/std": 0.28232985734939575, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.000037670135498, | |
| "sampling/importance_sampling_ratio/min": 0.08789712190628052, | |
| "sampling/sampling_logp_difference/max": 4.325733184814453, | |
| "sampling/sampling_logp_difference/mean": 0.012733320705592632, | |
| "step": 16 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7212.0, | |
| "completions/max_terminated_length": 7212.0, | |
| "completions/mean_length": 3347.291748046875, | |
| "completions/mean_terminated_length": 3347.291748046875, | |
| "completions/min_length": 829.0, | |
| "completions/min_terminated_length": 829.0, | |
| "entropy": 0.2636777497828007, | |
| "epoch": 0.023415977961432508, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.1893310489949768, | |
| "kl": 0.0020791269780602306, | |
| "learning_rate": 9.988020657471077e-07, | |
| "loss": -0.1188, | |
| "num_tokens": 1504110.0, | |
| "reward": 1.625, | |
| "reward_std": 0.5480016469955444, | |
| "rewards/cloze_reward/mean": 0.125, | |
| "rewards/cloze_reward/std": 0.337831974029541, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9997584819793701, | |
| "sampling/importance_sampling_ratio/min": 0.021851731464266777, | |
| "sampling/sampling_logp_difference/max": 3.8234751224517822, | |
| "sampling/sampling_logp_difference/mean": 0.014004481956362724, | |
| "step": 17 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2283.0, | |
| "completions/max_terminated_length": 2283.0, | |
| "completions/mean_length": 1511.5, | |
| "completions/mean_terminated_length": 1511.5, | |
| "completions/min_length": 805.0, | |
| "completions/min_terminated_length": 805.0, | |
| "entropy": 0.14751360192894936, | |
| "epoch": 0.024793388429752067, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.20794435707592182, | |
| "kl": 0.0025478473398834467, | |
| "learning_rate": 9.98647714177555e-07, | |
| "loss": -0.0004, | |
| "num_tokens": 1567602.0, | |
| "reward": 2.3333334922790527, | |
| "reward_std": 0.2357022613286972, | |
| "rewards/cloze_reward/mean": 0.375, | |
| "rewards/cloze_reward/std": 0.494535356760025, | |
| "rewards/code_reward/mean": 0.9583333134651184, | |
| "rewards/code_reward/std": 0.20412415266036987, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9995601177215576, | |
| "sampling/importance_sampling_ratio/min": 0.19188205897808075, | |
| "sampling/sampling_logp_difference/max": 1.650874376296997, | |
| "sampling/sampling_logp_difference/mean": 0.009327586740255356, | |
| "step": 18 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6061.0, | |
| "completions/max_terminated_length": 6061.0, | |
| "completions/mean_length": 2516.166748046875, | |
| "completions/mean_terminated_length": 2516.166748046875, | |
| "completions/min_length": 953.0, | |
| "completions/min_terminated_length": 953.0, | |
| "entropy": 0.27825580164790154, | |
| "epoch": 0.026170798898071626, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.16189513218269783, | |
| "kl": 0.0023431787849403918, | |
| "learning_rate": 9.984840253435568e-07, | |
| "loss": 0.0076, | |
| "num_tokens": 1653054.0, | |
| "reward": 2.125, | |
| "reward_std": 0.17251639068126678, | |
| "rewards/cloze_reward/mean": 0.6666666865348816, | |
| "rewards/cloze_reward/std": 0.4815434217453003, | |
| "rewards/code_reward/mean": 0.4583333432674408, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9999192357063293, | |
| "sampling/importance_sampling_ratio/min": 0.06552005559206009, | |
| "sampling/sampling_logp_difference/max": 2.7253990173339844, | |
| "sampling/sampling_logp_difference/mean": 0.013308782130479813, | |
| "step": 19 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5089.0, | |
| "completions/max_terminated_length": 5089.0, | |
| "completions/mean_length": 1440.9583740234375, | |
| "completions/mean_terminated_length": 1440.9583740234375, | |
| "completions/min_length": 896.0, | |
| "completions/min_terminated_length": 896.0, | |
| "entropy": 0.2597235403954983, | |
| "epoch": 0.027548209366391185, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.3058200238239626, | |
| "kl": 0.004811614111531526, | |
| "learning_rate": 9.983110023102145e-07, | |
| "loss": -0.0732, | |
| "num_tokens": 1709421.0, | |
| "reward": 2.0833334922790527, | |
| "reward_std": 0.6257078647613525, | |
| "rewards/cloze_reward/mean": 0.5833333134651184, | |
| "rewards/cloze_reward/std": 0.5036101937294006, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0002235174179077, | |
| "sampling/importance_sampling_ratio/min": 0.010991987772285938, | |
| "sampling/sampling_logp_difference/max": 4.510588645935059, | |
| "sampling/sampling_logp_difference/mean": 0.012450532987713814, | |
| "step": 20 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 6266.0, | |
| "completions/mean_length": 2971.95849609375, | |
| "completions/mean_terminated_length": 2745.0, | |
| "completions/min_length": 725.0, | |
| "completions/min_terminated_length": 725.0, | |
| "entropy": 0.23062221333384514, | |
| "epoch": 0.028925619834710745, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.20922776699151296, | |
| "kl": 0.002331310883164406, | |
| "learning_rate": 9.981286483174148e-07, | |
| "loss": 0.0302, | |
| "num_tokens": 1808892.0, | |
| "reward": 1.6666667461395264, | |
| "reward_std": 0.6947464346885681, | |
| "rewards/cloze_reward/mean": 0.1666666716337204, | |
| "rewards/cloze_reward/std": 0.3806934952735901, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.999830424785614, | |
| "sampling/importance_sampling_ratio/min": 5.94212829982399e-13, | |
| "sampling/sampling_logp_difference/max": 28.151538848876953, | |
| "sampling/sampling_logp_difference/mean": 0.013190208002924919, | |
| "step": 21 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3810.0, | |
| "completions/max_terminated_length": 3810.0, | |
| "completions/mean_length": 2031.5, | |
| "completions/mean_terminated_length": 2031.5, | |
| "completions/min_length": 1207.0, | |
| "completions/min_terminated_length": 1207.0, | |
| "entropy": 0.1824016384780407, | |
| "epoch": 0.030303030303030304, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2952973607886351, | |
| "kl": 0.004044964851345867, | |
| "learning_rate": 9.979369667797674e-07, | |
| "loss": 0.0554, | |
| "num_tokens": 1911664.0, | |
| "reward": 1.5, | |
| "reward_std": 0.48678088188171387, | |
| "rewards/cloze_reward/mean": 0.0, | |
| "rewards/cloze_reward/std": 0.0, | |
| "rewards/code_reward/mean": 0.5, | |
| "rewards/code_reward/std": 0.5107539296150208, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9998181462287903, | |
| "sampling/importance_sampling_ratio/min": 0.0026749928947538137, | |
| "sampling/sampling_logp_difference/max": 5.923808574676514, | |
| "sampling/sampling_logp_difference/mean": 0.011196551844477654, | |
| "step": 22 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7742.0, | |
| "completions/max_terminated_length": 7742.0, | |
| "completions/mean_length": 2664.791748046875, | |
| "completions/mean_terminated_length": 2664.791748046875, | |
| "completions/min_length": 716.0, | |
| "completions/min_terminated_length": 716.0, | |
| "entropy": 0.16602139174938202, | |
| "epoch": 0.03168044077134986, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.18646426939832977, | |
| "kl": 0.0027883301954716444, | |
| "learning_rate": 9.977359612865422e-07, | |
| "loss": -0.1659, | |
| "num_tokens": 2005483.0, | |
| "reward": 2.0, | |
| "reward_std": 0.34503278136253357, | |
| "rewards/cloze_reward/mean": 0.5416666865348816, | |
| "rewards/cloze_reward/std": 0.5089773535728455, | |
| "rewards/code_reward/mean": 0.4583333432674408, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0000344514846802, | |
| "sampling/importance_sampling_ratio/min": 2.7512096333753844e-12, | |
| "sampling/sampling_logp_difference/max": 26.618980407714844, | |
| "sampling/sampling_logp_difference/mean": 0.010194670408964157, | |
| "step": 23 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 7718.0, | |
| "completions/mean_length": 4289.5, | |
| "completions/mean_terminated_length": 4119.826171875, | |
| "completions/min_length": 979.0, | |
| "completions/min_terminated_length": 979.0, | |
| "entropy": 0.2995450422167778, | |
| "epoch": 0.03305785123966942, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.170700316120424, | |
| "kl": 0.0018004105950240046, | |
| "learning_rate": 9.975256356016017e-07, | |
| "loss": 0.0699, | |
| "num_tokens": 2128183.0, | |
| "reward": 2.0, | |
| "reward_std": 0.6288648843765259, | |
| "rewards/cloze_reward/mean": 0.4583333432674408, | |
| "rewards/cloze_reward/std": 0.5089773535728455, | |
| "rewards/code_reward/mean": 0.5833333134651184, | |
| "rewards/code_reward/std": 0.5036101341247559, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9999317526817322, | |
| "sampling/importance_sampling_ratio/min": 0.08646544814109802, | |
| "sampling/sampling_logp_difference/max": 2.4480104446411133, | |
| "sampling/sampling_logp_difference/mean": 0.014326645992696285, | |
| "step": 24 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7758.0, | |
| "completions/max_terminated_length": 7758.0, | |
| "completions/mean_length": 3043.45849609375, | |
| "completions/mean_terminated_length": 3043.45849609375, | |
| "completions/min_length": 1282.0, | |
| "completions/min_terminated_length": 1282.0, | |
| "entropy": 0.2883247286081314, | |
| "epoch": 0.03443526170798898, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.12988436998967914, | |
| "kl": 0.0025270123151130974, | |
| "learning_rate": 9.973059936633306e-07, | |
| "loss": 0.0545, | |
| "num_tokens": 2218418.0, | |
| "reward": 2.2916667461395264, | |
| "reward_std": 0.21362332999706268, | |
| "rewards/cloze_reward/mean": 0.75, | |
| "rewards/cloze_reward/std": 0.4423258602619171, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9999663829803467, | |
| "sampling/importance_sampling_ratio/min": 0.10468851774930954, | |
| "sampling/sampling_logp_difference/max": 2.256765842437744, | |
| "sampling/sampling_logp_difference/mean": 0.012674104422330856, | |
| "step": 25 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 4710.0, | |
| "completions/mean_length": 2069.625, | |
| "completions/mean_terminated_length": 1803.434814453125, | |
| "completions/min_length": 935.0, | |
| "completions/min_terminated_length": 935.0, | |
| "entropy": 0.1856878437101841, | |
| "epoch": 0.03581267217630854, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.2189265719041607, | |
| "kl": 0.0038806990487501025, | |
| "learning_rate": 9.970770395845622e-07, | |
| "loss": 0.3118, | |
| "num_tokens": 2309321.0, | |
| "reward": 2.375, | |
| "reward_std": 0.5061727166175842, | |
| "rewards/cloze_reward/mean": 0.5833333134651184, | |
| "rewards/cloze_reward/std": 0.5036101341247559, | |
| "rewards/code_reward/mean": 0.8333333134651184, | |
| "rewards/code_reward/std": 0.3806934952735901, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9998181462287903, | |
| "sampling/importance_sampling_ratio/min": 0.23940998315811157, | |
| "sampling/sampling_logp_difference/max": 1.4295778274536133, | |
| "sampling/sampling_logp_difference/mean": 0.009506626054644585, | |
| "step": 26 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 8177.0, | |
| "completions/mean_length": 4914.20849609375, | |
| "completions/mean_terminated_length": 4771.69580078125, | |
| "completions/min_length": 1792.0, | |
| "completions/min_terminated_length": 1792.0, | |
| "entropy": 0.3651472330093384, | |
| "epoch": 0.0371900826446281, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.16794783472528907, | |
| "kl": 0.0032920141238719225, | |
| "learning_rate": 9.968387776525007e-07, | |
| "loss": 0.0044, | |
| "num_tokens": 2447670.0, | |
| "reward": 1.7083333730697632, | |
| "reward_std": 0.6274997591972351, | |
| "rewards/cloze_reward/mean": 0.5, | |
| "rewards/cloze_reward/std": 0.5107539296150208, | |
| "rewards/code_reward/mean": 0.2916666567325592, | |
| "rewards/code_reward/std": 0.4643056094646454, | |
| "rewards/format_reward/mean": 0.9166666865348816, | |
| "rewards/format_reward/std": 0.28232985734939575, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9998521208763123, | |
| "sampling/importance_sampling_ratio/min": 0.00022299536794889718, | |
| "sampling/sampling_logp_difference/max": 8.40835952758789, | |
| "sampling/sampling_logp_difference/mean": 0.017919987440109253, | |
| "step": 27 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7702.0, | |
| "completions/max_terminated_length": 7702.0, | |
| "completions/mean_length": 3003.375, | |
| "completions/mean_terminated_length": 3003.375, | |
| "completions/min_length": 1013.0, | |
| "completions/min_terminated_length": 1013.0, | |
| "entropy": 0.30894363671541214, | |
| "epoch": 0.03856749311294766, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.2662075831826717, | |
| "kl": 0.003954145242460072, | |
| "learning_rate": 9.965912123286424e-07, | |
| "loss": 0.085, | |
| "num_tokens": 2537663.0, | |
| "reward": 2.5416667461395264, | |
| "reward_std": 0.4082186222076416, | |
| "rewards/cloze_reward/mean": 0.7083333134651184, | |
| "rewards/cloze_reward/std": 0.4643056094646454, | |
| "rewards/code_reward/mean": 0.8333333134651184, | |
| "rewards/code_reward/std": 0.3806934952735901, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0000332593917847, | |
| "sampling/importance_sampling_ratio/min": 0.21365012228488922, | |
| "sampling/sampling_logp_difference/max": 2.255774974822998, | |
| "sampling/sampling_logp_difference/mean": 0.014759156852960587, | |
| "step": 28 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0416666679084301, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 7521.0, | |
| "completions/mean_length": 2249.291748046875, | |
| "completions/mean_terminated_length": 1990.9130859375, | |
| "completions/min_length": 932.0, | |
| "completions/min_terminated_length": 932.0, | |
| "entropy": 0.20293106883764267, | |
| "epoch": 0.03994490358126722, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.30621070115238647, | |
| "kl": 0.004532316525001079, | |
| "learning_rate": 9.963343482486905e-07, | |
| "loss": 0.1541, | |
| "num_tokens": 2608934.0, | |
| "reward": 2.4166667461395264, | |
| "reward_std": 0.5028601884841919, | |
| "rewards/cloze_reward/mean": 0.5833333134651184, | |
| "rewards/cloze_reward/std": 0.5036101341247559, | |
| "rewards/code_reward/mean": 0.875, | |
| "rewards/code_reward/std": 0.337831974029541, | |
| "rewards/format_reward/mean": 0.9583333134651184, | |
| "rewards/format_reward/std": 0.20412415266036987, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.000059962272644, | |
| "sampling/importance_sampling_ratio/min": 0.2822778820991516, | |
| "sampling/sampling_logp_difference/max": 1.3750309944152832, | |
| "sampling/sampling_logp_difference/mean": 0.011287961155176163, | |
| "step": 29 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6912.0, | |
| "completions/max_terminated_length": 6912.0, | |
| "completions/mean_length": 2896.416748046875, | |
| "completions/mean_terminated_length": 2896.416748046875, | |
| "completions/min_length": 1072.0, | |
| "completions/min_terminated_length": 1072.0, | |
| "entropy": 0.24397730827331543, | |
| "epoch": 0.04132231404958678, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2637496585585402, | |
| "kl": 0.0052707926370203495, | |
| "learning_rate": 9.96068190222469e-07, | |
| "loss": -0.0809, | |
| "num_tokens": 2696368.0, | |
| "reward": 2.3333334922790527, | |
| "reward_std": 0.7106520533561707, | |
| "rewards/cloze_reward/mean": 0.7916666865348816, | |
| "rewards/cloze_reward/std": 0.4148511290550232, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.000116229057312, | |
| "sampling/importance_sampling_ratio/min": 0.26414525508880615, | |
| "sampling/sampling_logp_difference/max": 1.3681449890136719, | |
| "sampling/sampling_logp_difference/mean": 0.012104719877243042, | |
| "step": 30 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5352.0, | |
| "completions/max_terminated_length": 5352.0, | |
| "completions/mean_length": 1967.75, | |
| "completions/mean_terminated_length": 1967.75, | |
| "completions/min_length": 804.0, | |
| "completions/min_terminated_length": 804.0, | |
| "entropy": 0.21574077382683754, | |
| "epoch": 0.04269972451790634, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.12651354951166632, | |
| "kl": 0.0038389787659980357, | |
| "learning_rate": 9.95792743233833e-07, | |
| "loss": 0.1096, | |
| "num_tokens": 2766314.0, | |
| "reward": 2.4166667461395264, | |
| "reward_std": 0.15430335700511932, | |
| "rewards/cloze_reward/mean": 0.4166666567325592, | |
| "rewards/cloze_reward/std": 0.5036101937294006, | |
| "rewards/code_reward/mean": 1.0, | |
| "rewards/code_reward/std": 0.0, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9996727108955383, | |
| "sampling/importance_sampling_ratio/min": 0.11743912100791931, | |
| "sampling/sampling_logp_difference/max": 2.1418352127075195, | |
| "sampling/sampling_logp_difference/mean": 0.011217731051146984, | |
| "step": 31 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6975.0, | |
| "completions/max_terminated_length": 6975.0, | |
| "completions/mean_length": 2998.25, | |
| "completions/mean_terminated_length": 2998.25, | |
| "completions/min_length": 1004.0, | |
| "completions/min_terminated_length": 1004.0, | |
| "entropy": 0.34803634136915207, | |
| "epoch": 0.0440771349862259, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.18832712543023003, | |
| "kl": 0.003418200241867453, | |
| "learning_rate": 9.95508012440575e-07, | |
| "loss": 0.0195, | |
| "num_tokens": 2858208.0, | |
| "reward": 2.25, | |
| "reward_std": 0.5748276710510254, | |
| "rewards/cloze_reward/mean": 0.625, | |
| "rewards/cloze_reward/std": 0.494535356760025, | |
| "rewards/code_reward/mean": 0.625, | |
| "rewards/code_reward/std": 0.494535356760025, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0001078844070435, | |
| "sampling/importance_sampling_ratio/min": 0.33018070459365845, | |
| "sampling/sampling_logp_difference/max": 1.1081151962280273, | |
| "sampling/sampling_logp_difference/mean": 0.01556301862001419, | |
| "step": 32 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4436.0, | |
| "completions/max_terminated_length": 4436.0, | |
| "completions/mean_length": 2222.33349609375, | |
| "completions/mean_terminated_length": 2222.33349609375, | |
| "completions/min_length": 961.0, | |
| "completions/min_terminated_length": 961.0, | |
| "entropy": 0.2771223820745945, | |
| "epoch": 0.045454545454545456, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.17495461933111076, | |
| "kl": 0.0048337242333218455, | |
| "learning_rate": 9.95214003174328e-07, | |
| "loss": -0.0133, | |
| "num_tokens": 2930472.0, | |
| "reward": 2.5, | |
| "reward_std": 0.36585909128189087, | |
| "rewards/cloze_reward/mean": 0.6666666865348816, | |
| "rewards/cloze_reward/std": 0.4815434217453003, | |
| "rewards/code_reward/mean": 0.8333333134651184, | |
| "rewards/code_reward/std": 0.3806934952735901, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9997199177742004, | |
| "sampling/importance_sampling_ratio/min": 0.17066244781017303, | |
| "sampling/sampling_logp_difference/max": 2.808061122894287, | |
| "sampling/sampling_logp_difference/mean": 0.012434008531272411, | |
| "step": 33 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6776.0, | |
| "completions/max_terminated_length": 6776.0, | |
| "completions/mean_length": 3140.70849609375, | |
| "completions/mean_terminated_length": 3140.70849609375, | |
| "completions/min_length": 726.0, | |
| "completions/min_terminated_length": 726.0, | |
| "entropy": 0.28713061660528183, | |
| "epoch": 0.046831955922865015, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.5763177612619246, | |
| "kl": 0.005852235772181302, | |
| "learning_rate": 9.949107209404663e-07, | |
| "loss": 0.0443, | |
| "num_tokens": 3043473.0, | |
| "reward": 2.0416667461395264, | |
| "reward_std": 0.5222300291061401, | |
| "rewards/cloze_reward/mean": 0.5, | |
| "rewards/cloze_reward/std": 0.5107539296150208, | |
| "rewards/code_reward/mean": 0.5416666865348816, | |
| "rewards/code_reward/std": 0.5089773535728455, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0002092123031616, | |
| "sampling/importance_sampling_ratio/min": 0.010155132971704006, | |
| "sampling/sampling_logp_difference/max": 4.589776039123535, | |
| "sampling/sampling_logp_difference/mean": 0.012913873419165611, | |
| "step": 34 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0833333358168602, | |
| "completions/max_length": 8192.0, | |
| "completions/max_terminated_length": 7808.0, | |
| "completions/mean_length": 2782.75, | |
| "completions/mean_terminated_length": 2291.0, | |
| "completions/min_length": 736.0, | |
| "completions/min_terminated_length": 736.0, | |
| "entropy": 0.23918093740940094, | |
| "epoch": 0.048209366391184574, | |
| "frac_reward_zero_std": 0.6666666865348816, | |
| "grad_norm": 0.19663548974567993, | |
| "kl": 0.004491619183681905, | |
| "learning_rate": 9.94598171418002e-07, | |
| "loss": 0.1035, | |
| "num_tokens": 3126827.0, | |
| "reward": 2.1666667461395264, | |
| "reward_std": 0.35634833574295044, | |
| "rewards/cloze_reward/mean": 0.4166666567325592, | |
| "rewards/cloze_reward/std": 0.5036101937294006, | |
| "rewards/code_reward/mean": 0.8333333134651184, | |
| "rewards/code_reward/std": 0.3806934952735901, | |
| "rewards/format_reward/mean": 0.9166666865348816, | |
| "rewards/format_reward/std": 0.28232985734939575, | |
| "sampling/importance_sampling_ratio/max": 1.8434656858444214, | |
| "sampling/importance_sampling_ratio/mean": 1.0001463890075684, | |
| "sampling/importance_sampling_ratio/min": 0.14892546832561493, | |
| "sampling/sampling_logp_difference/max": 1.9043092727661133, | |
| "sampling/sampling_logp_difference/mean": 0.011121454648673534, | |
| "step": 35 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6397.0, | |
| "completions/max_terminated_length": 6397.0, | |
| "completions/mean_length": 2130.70849609375, | |
| "completions/mean_terminated_length": 2130.70849609375, | |
| "completions/min_length": 908.0, | |
| "completions/min_terminated_length": 908.0, | |
| "entropy": 0.25559694319963455, | |
| "epoch": 0.049586776859504134, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.20044650714243367, | |
| "kl": 0.004921266925521195, | |
| "learning_rate": 9.94276360459479e-07, | |
| "loss": 0.0668, | |
| "num_tokens": 3204780.0, | |
| "reward": 2.0, | |
| "reward_std": 0.34503278136253357, | |
| "rewards/cloze_reward/mean": 0.125, | |
| "rewards/cloze_reward/std": 0.337831974029541, | |
| "rewards/code_reward/mean": 0.875, | |
| "rewards/code_reward/std": 0.337831974029541, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9998623728752136, | |
| "sampling/importance_sampling_ratio/min": 0.17689232528209686, | |
| "sampling/sampling_logp_difference/max": 1.732214093208313, | |
| "sampling/sampling_logp_difference/mean": 0.012361792847514153, | |
| "step": 36 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 4222.0, | |
| "completions/max_terminated_length": 4222.0, | |
| "completions/mean_length": 1950.666748046875, | |
| "completions/mean_terminated_length": 1950.666748046875, | |
| "completions/min_length": 1084.0, | |
| "completions/min_terminated_length": 1084.0, | |
| "entropy": 0.2424805425107479, | |
| "epoch": 0.05096418732782369, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.2413970223564091, | |
| "kl": 0.004224332165904343, | |
| "learning_rate": 9.939452940908626e-07, | |
| "loss": 0.0541, | |
| "num_tokens": 3282572.0, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.48678088188171387, | |
| "rewards/cloze_reward/mean": 0.0833333358168602, | |
| "rewards/cloze_reward/std": 0.28232985734939575, | |
| "rewards/code_reward/mean": 0.75, | |
| "rewards/code_reward/std": 0.4423258602619171, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.999829113483429, | |
| "sampling/importance_sampling_ratio/min": 0.16516301035881042, | |
| "sampling/sampling_logp_difference/max": 1.800822377204895, | |
| "sampling/sampling_logp_difference/mean": 0.01168552041053772, | |
| "step": 37 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 6989.0, | |
| "completions/max_terminated_length": 6989.0, | |
| "completions/mean_length": 2920.041748046875, | |
| "completions/mean_terminated_length": 2920.041748046875, | |
| "completions/min_length": 1364.0, | |
| "completions/min_terminated_length": 1364.0, | |
| "entropy": 0.2422230765223503, | |
| "epoch": 0.05234159779614325, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.16283138475922154, | |
| "kl": 0.004039893392473459, | |
| "learning_rate": 9.936049785114279e-07, | |
| "loss": 0.0391, | |
| "num_tokens": 3380365.0, | |
| "reward": 1.8333333730697632, | |
| "reward_std": 0.43015047907829285, | |
| "rewards/cloze_reward/mean": 0.125, | |
| "rewards/cloze_reward/std": 0.337831974029541, | |
| "rewards/code_reward/mean": 0.7083333134651184, | |
| "rewards/code_reward/std": 0.4643056094646454, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9998534321784973, | |
| "sampling/importance_sampling_ratio/min": 0.11749733984470367, | |
| "sampling/sampling_logp_difference/max": 2.1413395404815674, | |
| "sampling/sampling_logp_difference/mean": 0.012631865218281746, | |
| "step": 38 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7304.0, | |
| "completions/max_terminated_length": 7304.0, | |
| "completions/mean_length": 3322.041748046875, | |
| "completions/mean_terminated_length": 3322.041748046875, | |
| "completions/min_length": 1.0, | |
| "completions/min_terminated_length": 1.0, | |
| "entropy": 0.4101082310080528, | |
| "epoch": 0.05371900826446281, | |
| "frac_reward_zero_std": 0.0, | |
| "grad_norm": 0.181795370019662, | |
| "kl": 0.0036983516183681786, | |
| "learning_rate": 9.932554200936426e-07, | |
| "loss": -0.1581, | |
| "num_tokens": 3488278.0, | |
| "reward": 1.2083333730697632, | |
| "reward_std": 0.6043562889099121, | |
| "rewards/cloze_reward/mean": 0.1666666716337204, | |
| "rewards/cloze_reward/std": 0.3806934952735901, | |
| "rewards/code_reward/mean": 0.375, | |
| "rewards/code_reward/std": 0.494535356760025, | |
| "rewards/format_reward/mean": 0.6666666865348816, | |
| "rewards/format_reward/std": 0.4815434217453003, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 0.9999675154685974, | |
| "sampling/importance_sampling_ratio/min": 0.04754088446497917, | |
| "sampling/sampling_logp_difference/max": 3.0461652278900146, | |
| "sampling/sampling_logp_difference/mean": 0.01610734686255455, | |
| "step": 39 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5011.0, | |
| "completions/max_terminated_length": 5011.0, | |
| "completions/mean_length": 2942.58349609375, | |
| "completions/mean_terminated_length": 2942.58349609375, | |
| "completions/min_length": 1454.0, | |
| "completions/min_terminated_length": 1454.0, | |
| "entropy": 0.35880884528160095, | |
| "epoch": 0.05509641873278237, | |
| "frac_reward_zero_std": 0.3333333432674408, | |
| "grad_norm": 0.17000406623354494, | |
| "kl": 0.003319010080303997, | |
| "learning_rate": 9.92896625383049e-07, | |
| "loss": -0.0559, | |
| "num_tokens": 3575956.0, | |
| "reward": 1.875, | |
| "reward_std": 0.4023112952709198, | |
| "rewards/cloze_reward/mean": 0.4583333432674408, | |
| "rewards/cloze_reward/std": 0.5089773535728455, | |
| "rewards/code_reward/mean": 0.4166666567325592, | |
| "rewards/code_reward/std": 0.5036101937294006, | |
| "rewards/format_reward/mean": 1.0, | |
| "rewards/format_reward/std": 0.0, | |
| "sampling/importance_sampling_ratio/max": 2.0, | |
| "sampling/importance_sampling_ratio/mean": 1.0000556707382202, | |
| "sampling/importance_sampling_ratio/min": 0.27387282252311707, | |
| "sampling/sampling_logp_difference/max": 1.2950913906097412, | |
| "sampling/sampling_logp_difference/mean": 0.01579936034977436, | |
| "step": 40 | |
| } | |
| ], | |
| "logging_steps": 1.0, | |
| "max_steps": 726, | |
| "num_input_tokens_seen": 3575956, | |
| "num_train_epochs": 1, | |
| "save_steps": 20, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 1, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |