{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.28150991682661547, "eval_steps": 500, "global_step": 440, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5742.0, "completions/max_terminated_length": 5742.0, "completions/mean_length": 2397.95849609375, "completions/mean_terminated_length": 2397.95849609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3080432079732418, "epoch": 0.0006397952655150352, "frac_reward_zero_std": 0.0, "grad_norm": 0.1001091835451251, "kl": 0.0, "learning_rate": 1e-06, "loss": -0.1038, "num_tokens": 70879.0, "reward": 0.9583333730697632, "reward_std": 1.0317248106002808, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.6883533000946045, "sampling/importance_sampling_ratio/mean": 0.9999645352363586, "sampling/importance_sampling_ratio/min": 0.39933526515960693, "sampling/sampling_logp_difference/max": 0.9179539680480957, "sampling/sampling_logp_difference/mean": 0.007980454713106155, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3038.0, "completions/max_terminated_length": 3038.0, "completions/mean_length": 1852.666748046875, "completions/mean_terminated_length": 1852.666748046875, "completions/min_length": 1244.0, "completions/min_terminated_length": 1244.0, "entropy": 0.245514877140522, "epoch": 0.0012795905310300703, "frac_reward_zero_std": 0.0, "grad_norm": 0.09456785723106247, "kl": 0.00011274554162810091, "learning_rate": 9.999989899993534e-07, "loss": 0.0429, "num_tokens": 125727.0, "reward": 0.8333333730697632, "reward_std": 0.8924775123596191, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.6885267496109009, "sampling/importance_sampling_ratio/mean": 0.9999716877937317, "sampling/importance_sampling_ratio/min": 0.5457502007484436, "sampling/sampling_logp_difference/max": 0.6055939197540283, "sampling/sampling_logp_difference/mean": 0.0075882067903876305, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7380.0, "completions/mean_length": 3249.25, "completions/mean_terminated_length": 3034.347900390625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4153172969818115, "epoch": 0.0019193857965451055, "frac_reward_zero_std": 0.0, "grad_norm": 0.10341984626292189, "kl": 0.00010614286657073535, "learning_rate": 9.99995960001494e-07, "loss": -0.1326, "num_tokens": 220269.0, "reward": 0.3333333432674408, "reward_std": 0.4714045226573944, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.600479245185852, "sampling/importance_sampling_ratio/mean": 0.9999856948852539, "sampling/importance_sampling_ratio/min": 0.6413337588310242, "sampling/sampling_logp_difference/max": 0.4703030586242676, "sampling/sampling_logp_difference/mean": 0.011051096022129059, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7942.0, "completions/mean_length": 2937.125, "completions/mean_terminated_length": 2459.4091796875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6708895936608315, "epoch": 0.0025591810620601407, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10455285493611831, "kl": 0.00024925392972363625, "learning_rate": 9.999909100186631e-07, "loss": -0.0919, "num_tokens": 308936.0, "reward": 0.4583333432674408, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00020170211792, "sampling/importance_sampling_ratio/min": 0.495281457901001, "sampling/sampling_logp_difference/max": 0.7764825820922852, "sampling/sampling_logp_difference/mean": 0.010273557156324387, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7844.0, "completions/max_terminated_length": 7844.0, "completions/mean_length": 4063.166748046875, "completions/mean_terminated_length": 4063.166748046875, "completions/min_length": 1542.0, "completions/min_terminated_length": 1542.0, "entropy": 0.47108251601457596, "epoch": 0.003198976327575176, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07709342883119034, "kl": 0.00016283100558212027, "learning_rate": 9.999838400712626e-07, "loss": -0.0458, "num_tokens": 421004.0, "reward": 0.1666666716337204, "reward_std": 0.39000558853149414, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.125, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.785794734954834, "sampling/importance_sampling_ratio/mean": 0.9998542666435242, "sampling/importance_sampling_ratio/min": 0.5220168828964233, "sampling/sampling_logp_difference/max": 0.6500552892684937, "sampling/sampling_logp_difference/mean": 0.013157735578715801, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6535.0, "completions/mean_length": 3868.625, "completions/mean_terminated_length": 3680.65234375, "completions/min_length": 1101.0, "completions/min_terminated_length": 1101.0, "entropy": 0.40071048587560654, "epoch": 0.003838771593090211, "frac_reward_zero_std": 0.0, "grad_norm": 0.08764560553852271, "kl": 0.00012013058767479379, "learning_rate": 9.99974750187855e-07, "loss": -0.0506, "num_tokens": 537051.0, "reward": 0.75, "reward_std": 0.725618839263916, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999076724052429, "sampling/importance_sampling_ratio/min": 0.36530813574790955, "sampling/sampling_logp_difference/max": 1.0070140361785889, "sampling/sampling_logp_difference/mean": 0.010840469971299171, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 6036.0, "completions/mean_length": 3555.25, "completions/mean_terminated_length": 2892.857177734375, "completions/min_length": 723.0, "completions/min_terminated_length": 723.0, "entropy": 0.3351628929376602, "epoch": 0.004478566858605247, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.04525968163672171, "kl": 9.81534267339157e-05, "learning_rate": 9.999636404051636e-07, "loss": -0.0077, "num_tokens": 643265.0, "reward": 0.5, "reward_std": 0.4232262969017029, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.7433435916900635, "sampling/importance_sampling_ratio/mean": 0.9999358057975769, "sampling/importance_sampling_ratio/min": 0.47315531969070435, "sampling/sampling_logp_difference/max": 0.7483315467834473, "sampling/sampling_logp_difference/mean": 0.00898043718189001, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6467.0, "completions/mean_length": 3544.916748046875, "completions/mean_terminated_length": 3122.45458984375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5435673668980598, "epoch": 0.005118362124120281, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.04439660256976967, "kl": 0.00013506251343642361, "learning_rate": 9.999505107680721e-07, "loss": 0.0491, "num_tokens": 744359.0, "reward": 0.1666666716337204, "reward_std": 0.30860671401023865, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.0833333358168602, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.9946341514587402, "sampling/importance_sampling_ratio/mean": 1.0000513792037964, "sampling/importance_sampling_ratio/min": 0.33995750546455383, "sampling/sampling_logp_difference/max": 1.078934669494629, "sampling/sampling_logp_difference/mean": 0.012841835618019104, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5410.0, "completions/max_terminated_length": 5410.0, "completions/mean_length": 2651.0, "completions/mean_terminated_length": 2651.0, "completions/min_length": 1320.0, "completions/min_terminated_length": 1320.0, "entropy": 0.3098254017531872, "epoch": 0.005758157389635317, "frac_reward_zero_std": 0.0, "grad_norm": 0.10143198832943535, "kl": 0.00012171554590167943, "learning_rate": 9.999353613296237e-07, "loss": -0.0127, "num_tokens": 823191.0, "reward": 0.7083333730697632, "reward_std": 0.6461589932441711, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.6638469696044922, "sampling/importance_sampling_ratio/mean": 1.0000923871994019, "sampling/importance_sampling_ratio/min": 0.5968568325042725, "sampling/sampling_logp_difference/max": 0.516077995300293, "sampling/sampling_logp_difference/mean": 0.0093992343172431, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7603.0, "completions/max_terminated_length": 7603.0, "completions/mean_length": 2988.916748046875, "completions/mean_terminated_length": 2988.916748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5824523195624352, "epoch": 0.006397952655150352, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.08070081823836218, "kl": 0.00011698708476615138, "learning_rate": 9.999181921510229e-07, "loss": -0.0458, "num_tokens": 915813.0, "reward": 0.7916666865348816, "reward_std": 0.17251639068126678, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.512447476387024, "sampling/importance_sampling_ratio/mean": 1.00006103515625, "sampling/importance_sampling_ratio/min": 0.6058946847915649, "sampling/sampling_logp_difference/max": 0.5010490417480469, "sampling/sampling_logp_difference/mean": 0.010886563919484615, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5323.0, "completions/max_terminated_length": 5323.0, "completions/mean_length": 2369.08349609375, "completions/mean_terminated_length": 2369.08349609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.48641378059983253, "epoch": 0.007037747920665387, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07066960727296293, "kl": 0.00010471514906384982, "learning_rate": 9.998990033016325e-07, "loss": -0.0644, "num_tokens": 991911.0, "reward": 0.5833333730697632, "reward_std": 0.5443090200424194, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000642538070679, "sampling/importance_sampling_ratio/min": 0.5733522772789001, "sampling/sampling_logp_difference/max": 1.0622358322143555, "sampling/sampling_logp_difference/mean": 0.008501792326569557, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7179.0, "completions/max_terminated_length": 7179.0, "completions/mean_length": 3062.125, "completions/mean_terminated_length": 3062.125, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.40063896030187607, "epoch": 0.007677543186180422, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05944570599723367, "kl": 0.0004441710370883811, "learning_rate": 9.99877794858976e-07, "loss": 0.0144, "num_tokens": 1088106.0, "reward": 0.1666666716337204, "reward_std": 0.39000558853149414, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.125, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.5156644582748413, "sampling/importance_sampling_ratio/mean": 1.0000349283218384, "sampling/importance_sampling_ratio/min": 0.4451243579387665, "sampling/sampling_logp_difference/max": 0.8094016313552856, "sampling/sampling_logp_difference/mean": 0.010357961058616638, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7039.0, "completions/max_terminated_length": 7039.0, "completions/mean_length": 2906.5, "completions/mean_terminated_length": 2906.5, "completions/min_length": 813.0, "completions/min_terminated_length": 813.0, "entropy": 0.3861211910843849, "epoch": 0.008317338451695458, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0559949944548977, "kl": 0.00013091588880342897, "learning_rate": 9.998545669087356e-07, "loss": 0.001, "num_tokens": 1169422.0, "reward": 0.7916666865348816, "reward_std": 0.24800793826580048, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.7533085346221924, "sampling/importance_sampling_ratio/mean": 0.9998784065246582, "sampling/importance_sampling_ratio/min": 0.5587930083274841, "sampling/sampling_logp_difference/max": 0.5819761753082275, "sampling/sampling_logp_difference/mean": 0.010653229430317879, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5677.0, "completions/max_terminated_length": 5677.0, "completions/mean_length": 2932.375, "completions/mean_terminated_length": 2932.375, "completions/min_length": 933.0, "completions/min_terminated_length": 933.0, "entropy": 0.4752734676003456, "epoch": 0.008957133717210493, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08824438882152477, "kl": 0.0001526698724774178, "learning_rate": 9.998293195447518e-07, "loss": 0.0295, "num_tokens": 1249199.0, "reward": 0.625, "reward_std": 0.6139818429946899, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.5076181888580322, "sampling/importance_sampling_ratio/mean": 1.0001312494277954, "sampling/importance_sampling_ratio/min": 0.6113912463188171, "sampling/sampling_logp_difference/max": 0.4920182228088379, "sampling/sampling_logp_difference/mean": 0.01304303016513586, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 7761.0, "completions/mean_length": 4476.2919921875, "completions/mean_terminated_length": 3733.150146484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.661351889371872, "epoch": 0.009596928982725527, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.04816371903495797, "kl": 0.00014781709614908323, "learning_rate": 9.998020528690244e-07, "loss": 0.0362, "num_tokens": 1370278.0, "reward": 0.25, "reward_std": 0.4232262969017029, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.1666666716337204, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000642538070679, "sampling/importance_sampling_ratio/min": 0.36398521065711975, "sampling/sampling_logp_difference/max": 1.0106420516967773, "sampling/sampling_logp_difference/mean": 0.011925773695111275, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6109.0, "completions/max_terminated_length": 6109.0, "completions/mean_length": 2761.375, "completions/mean_terminated_length": 2761.375, "completions/min_length": 1202.0, "completions/min_terminated_length": 1202.0, "entropy": 0.257974810898304, "epoch": 0.010236724248240563, "frac_reward_zero_std": 0.0, "grad_norm": 0.08730152603078431, "kl": 9.924870573740918e-05, "learning_rate": 9.997727669917109e-07, "loss": -0.0303, "num_tokens": 1455527.0, "reward": 0.9166666865348816, "reward_std": 0.7326550483703613, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4525519609451294, "sampling/importance_sampling_ratio/mean": 0.9999193549156189, "sampling/importance_sampling_ratio/min": 0.29509884119033813, "sampling/sampling_logp_difference/max": 1.220444917678833, "sampling/sampling_logp_difference/mean": 0.007625948172062635, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4893.0, "completions/max_terminated_length": 4893.0, "completions/mean_length": 1717.7083740234375, "completions/mean_terminated_length": 1717.7083740234375, "completions/min_length": 691.0, "completions/min_terminated_length": 691.0, "entropy": 0.2683862894773483, "epoch": 0.010876519513755598, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.051909705152346365, "kl": 0.0001364501458738232, "learning_rate": 9.997414620311261e-07, "loss": -0.0206, "num_tokens": 1513496.0, "reward": 1.2083333730697632, "reward_std": 0.17251639068126678, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.595645785331726, "sampling/importance_sampling_ratio/mean": 1.000101089477539, "sampling/importance_sampling_ratio/min": 0.46683546900749207, "sampling/sampling_logp_difference/max": 0.7617783546447754, "sampling/sampling_logp_difference/mean": 0.008072523400187492, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 5487.0, "completions/mean_length": 2605.45849609375, "completions/mean_terminated_length": 2097.591064453125, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3296247459948063, "epoch": 0.011516314779270634, "frac_reward_zero_std": 0.0, "grad_norm": 0.08210294729997676, "kl": 8.948578397394158e-05, "learning_rate": 9.997081381137421e-07, "loss": 0.091, "num_tokens": 1595819.0, "reward": 0.625, "reward_std": 0.7825304269790649, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.580452561378479, "sampling/importance_sampling_ratio/mean": 0.9999315142631531, "sampling/importance_sampling_ratio/min": 0.6033550500869751, "sampling/sampling_logp_difference/max": 0.5052495002746582, "sampling/sampling_logp_difference/mean": 0.0057948133908212185, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6943.0, "completions/mean_length": 3427.75, "completions/mean_terminated_length": 3220.608642578125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "entropy": 0.4692671597003937, "epoch": 0.01215611004478567, "frac_reward_zero_std": 0.0, "grad_norm": 0.10329745784131909, "kl": 0.00023202858574222773, "learning_rate": 9.996727953741878e-07, "loss": 0.0009, "num_tokens": 1690749.0, "reward": 0.375, "reward_std": 0.42645785212516785, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999127388000488, "sampling/importance_sampling_ratio/min": 0.3403118848800659, "sampling/sampling_logp_difference/max": 1.077892780303955, "sampling/sampling_logp_difference/mean": 0.012087173759937286, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5988.0, "completions/mean_length": 3532.70849609375, "completions/mean_terminated_length": 3330.130615234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5121772661805153, "epoch": 0.012795905310300703, "frac_reward_zero_std": 0.0, "grad_norm": 0.09679831943643168, "kl": 0.000531249177583959, "learning_rate": 9.99635433955248e-07, "loss": -0.0663, "num_tokens": 1788998.0, "reward": 0.6666666865348816, "reward_std": 0.7076864242553711, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998162388801575, "sampling/importance_sampling_ratio/min": 0.08830394595861435, "sampling/sampling_logp_difference/max": 2.4269704818725586, "sampling/sampling_logp_difference/mean": 0.01224372535943985, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3031.0, "completions/max_terminated_length": 3031.0, "completions/mean_length": 1980.375, "completions/mean_terminated_length": 1980.375, "completions/min_length": 392.0, "completions/min_terminated_length": 392.0, "entropy": 0.25175099074840546, "epoch": 0.013435700575815739, "frac_reward_zero_std": 0.0, "grad_norm": 0.10647843812689045, "kl": 0.00015870557763264515, "learning_rate": 9.995960540078626e-07, "loss": -0.0352, "num_tokens": 1855631.0, "reward": 0.625, "reward_std": 0.7532514333724976, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001360177993774, "sampling/importance_sampling_ratio/min": 0.1706869900226593, "sampling/sampling_logp_difference/max": 1.7679238319396973, "sampling/sampling_logp_difference/mean": 0.007579015102237463, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3941.0, "completions/max_terminated_length": 3941.0, "completions/mean_length": 2377.75, "completions/mean_terminated_length": 2377.75, "completions/min_length": 450.0, "completions/min_terminated_length": 450.0, "entropy": 0.3704076409339905, "epoch": 0.014075495841330775, "frac_reward_zero_std": 0.0, "grad_norm": 0.11084646807107873, "kl": 0.0001970504890778102, "learning_rate": 9.995546556911269e-07, "loss": -0.0183, "num_tokens": 1925457.0, "reward": 0.625, "reward_std": 0.7785386443138123, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.7677178382873535, "sampling/importance_sampling_ratio/mean": 1.0000046491622925, "sampling/importance_sampling_ratio/min": 0.00015864685701671988, "sampling/sampling_logp_difference/max": 8.74882984161377, "sampling/sampling_logp_difference/mean": 0.010866448283195496, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5190.0, "completions/max_terminated_length": 5190.0, "completions/mean_length": 2104.75, "completions/mean_terminated_length": 2104.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3608454242348671, "epoch": 0.01471529110684581, "frac_reward_zero_std": 0.0, "grad_norm": 0.1283586866481102, "kl": 0.0002850361343007535, "learning_rate": 9.995112391722904e-07, "loss": -0.0365, "num_tokens": 1986315.0, "reward": 0.6666666865348816, "reward_std": 0.6986123323440552, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5017036199569702, "sampling/importance_sampling_ratio/mean": 1.0000039339065552, "sampling/importance_sampling_ratio/min": 0.5939456224441528, "sampling/sampling_logp_difference/max": 0.5209674835205078, "sampling/sampling_logp_difference/mean": 0.009648634120821953, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5836.0, "completions/max_terminated_length": 5836.0, "completions/mean_length": 2128.541748046875, "completions/mean_terminated_length": 2128.541748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4475998878479004, "epoch": 0.015355086372360844, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.03702990638338413, "kl": 0.0005096941895317286, "learning_rate": 9.994658046267554e-07, "loss": -0.0098, "num_tokens": 2050480.0, "reward": 0.75, "reward_std": 0.15430335700511932, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.8558776378631592, "sampling/importance_sampling_ratio/mean": 1.0000957250595093, "sampling/importance_sampling_ratio/min": 0.5907840728759766, "sampling/sampling_logp_difference/max": 0.6183576583862305, "sampling/sampling_logp_difference/mean": 0.010098821483552456, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6457.0, "completions/max_terminated_length": 6457.0, "completions/mean_length": 2719.70849609375, "completions/mean_terminated_length": 2719.70849609375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3894442319869995, "epoch": 0.01599488163787588, "frac_reward_zero_std": 0.0, "grad_norm": 0.09575341938010082, "kl": 0.0001372909846395487, "learning_rate": 9.994183522380783e-07, "loss": 0.0449, "num_tokens": 2130177.0, "reward": 0.7916666865348816, "reward_std": 0.8520830869674683, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.884177803993225, "sampling/importance_sampling_ratio/mean": 1.0001087188720703, "sampling/importance_sampling_ratio/min": 0.35813385248184204, "sampling/sampling_logp_difference/max": 1.026848554611206, "sampling/sampling_logp_difference/mean": 0.009768148884177208, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3602.0, "completions/max_terminated_length": 3602.0, "completions/mean_length": 1812.0833740234375, "completions/mean_terminated_length": 1812.0833740234375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.31878741830587387, "epoch": 0.016634676903390915, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08406977978461763, "kl": 0.00030571798924938776, "learning_rate": 9.993688821979663e-07, "loss": -0.0764, "num_tokens": 2192083.0, "reward": 1.5833333730697632, "reward_std": 0.487678587436676, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.5521268844604492, "sampling/importance_sampling_ratio/mean": 1.000113844871521, "sampling/importance_sampling_ratio/min": 0.3798959255218506, "sampling/sampling_logp_difference/max": 0.9678580164909363, "sampling/sampling_logp_difference/mean": 0.0076737902127206326, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3429.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 2172.45849609375, "completions/mean_terminated_length": 2172.45849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4065181836485863, "epoch": 0.01727447216890595, "frac_reward_zero_std": 0.0, "grad_norm": 0.10998595256603744, "kl": 0.0003296569630037993, "learning_rate": 9.993173947062788e-07, "loss": -0.0381, "num_tokens": 2262566.0, "reward": 0.5833333730697632, "reward_std": 0.7013810873031616, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.6588937044143677, "sampling/importance_sampling_ratio/mean": 0.999814510345459, "sampling/importance_sampling_ratio/min": 0.4355907440185547, "sampling/sampling_logp_difference/max": 0.8310521841049194, "sampling/sampling_logp_difference/mean": 0.00867622159421444, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5872.0, "completions/max_terminated_length": 5872.0, "completions/mean_length": 2431.625, "completions/mean_terminated_length": 2431.625, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.3684285134077072, "epoch": 0.017914267434420986, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.03638707717922571, "kl": 0.00022771044314140454, "learning_rate": 9.992638899710251e-07, "loss": -0.0112, "num_tokens": 2335541.0, "reward": 1.125, "reward_std": 0.3053751587867737, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999198317527771, "sampling/importance_sampling_ratio/min": 0.6221805214881897, "sampling/sampling_logp_difference/max": 0.7190766334533691, "sampling/sampling_logp_difference/mean": 0.01016818918287754, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5974.0, "completions/mean_length": 2495.08349609375, "completions/mean_terminated_length": 2247.391357421875, "completions/min_length": 646.0, "completions/min_terminated_length": 646.0, "entropy": 0.3639747351408005, "epoch": 0.018554062699936022, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05445520762475826, "kl": 0.00024381700495723635, "learning_rate": 9.992083682083648e-07, "loss": -0.0962, "num_tokens": 2405287.0, "reward": 1.0416667461395264, "reward_std": 0.48112308979034424, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.3083051443099976, "sampling/importance_sampling_ratio/mean": 0.9998784065246582, "sampling/importance_sampling_ratio/min": 0.437863290309906, "sampling/sampling_logp_difference/max": 0.8258485794067383, "sampling/sampling_logp_difference/mean": 0.010578814893960953, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7453.0, "completions/max_terminated_length": 7453.0, "completions/mean_length": 2862.541748046875, "completions/mean_terminated_length": 2862.541748046875, "completions/min_length": 1055.0, "completions/min_terminated_length": 1055.0, "entropy": 0.34849873185157776, "epoch": 0.019193857965451054, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08594795251763448, "kl": 0.00022780914150644094, "learning_rate": 9.991508296426057e-07, "loss": 0.0657, "num_tokens": 2491804.0, "reward": 0.5416666865348816, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.6958602666854858, "sampling/importance_sampling_ratio/mean": 1.0000547170639038, "sampling/importance_sampling_ratio/min": 0.4493519067764282, "sampling/sampling_logp_difference/max": 0.7999489307403564, "sampling/sampling_logp_difference/mean": 0.010383294895291328, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4098.0, "completions/max_terminated_length": 4098.0, "completions/mean_length": 1727.0, "completions/mean_terminated_length": 1727.0, "completions/min_length": 671.0, "completions/min_terminated_length": 671.0, "entropy": 0.2520703747868538, "epoch": 0.01983365323096609, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10533397286593542, "kl": 0.00018870570420403965, "learning_rate": 9.990912745062038e-07, "loss": -0.0315, "num_tokens": 2546932.0, "reward": 1.2083333730697632, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5816316604614258, "sampling/importance_sampling_ratio/mean": 0.9998283982276917, "sampling/importance_sampling_ratio/min": 0.46649837493896484, "sampling/sampling_logp_difference/max": 0.7625007629394531, "sampling/sampling_logp_difference/mean": 0.007473343517631292, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5163.0, "completions/mean_length": 2895.625, "completions/mean_terminated_length": 2665.347900390625, "completions/min_length": 1136.0, "completions/min_terminated_length": 1136.0, "entropy": 0.3634922131896019, "epoch": 0.020473448496481125, "frac_reward_zero_std": 0.0, "grad_norm": 0.09410102535979543, "kl": 0.00021498877322301269, "learning_rate": 9.990297030397624e-07, "loss": 0.1112, "num_tokens": 2631683.0, "reward": 0.7916666865348816, "reward_std": 0.654287576675415, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5481774806976318, "sampling/importance_sampling_ratio/mean": 1.0000550746917725, "sampling/importance_sampling_ratio/min": 0.4705967605113983, "sampling/sampling_logp_difference/max": 0.7537537217140198, "sampling/sampling_logp_difference/mean": 0.009767662733793259, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7714.0, "completions/mean_length": 4332.6669921875, "completions/mean_terminated_length": 3981.818359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6037634313106537, "epoch": 0.02111324376199616, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05908410669104177, "kl": 0.0005225270906521473, "learning_rate": 9.989661154920299e-07, "loss": 0.0191, "num_tokens": 2748467.0, "reward": 0.2916666865348816, "reward_std": 0.41331955790519714, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.6249933242797852, "sampling/importance_sampling_ratio/mean": 0.9998361468315125, "sampling/importance_sampling_ratio/min": 0.5560707449913025, "sampling/sampling_logp_difference/max": 0.5868597030639648, "sampling/sampling_logp_difference/mean": 0.010989299044013023, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7213.0, "completions/mean_length": 3820.58349609375, "completions/mean_terminated_length": 3196.09521484375, "completions/min_length": 1576.0, "completions/min_terminated_length": 1576.0, "entropy": 0.33011268079280853, "epoch": 0.021753039027511197, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06995667973576913, "kl": 0.0002211697501479648, "learning_rate": 9.989005121199003e-07, "loss": 0.0476, "num_tokens": 2858417.0, "reward": 0.2083333432674408, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5740724802017212, "sampling/importance_sampling_ratio/mean": 0.9999253749847412, "sampling/importance_sampling_ratio/min": 0.23011836409568787, "sampling/sampling_logp_difference/max": 1.4691615104675293, "sampling/sampling_logp_difference/mean": 0.009538669139146805, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5699.0, "completions/max_terminated_length": 5699.0, "completions/mean_length": 2696.0, "completions/mean_terminated_length": 2696.0, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "entropy": 0.49608713388442993, "epoch": 0.022392834293026232, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10316085882171405, "kl": 0.0003412327350815758, "learning_rate": 9.988328931884115e-07, "loss": 0.094, "num_tokens": 2936905.0, "reward": 0.4583333432674408, "reward_std": 0.46288391947746277, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000045895576477, "sampling/importance_sampling_ratio/min": 0.5803824067115784, "sampling/sampling_logp_difference/max": 0.8827576637268066, "sampling/sampling_logp_difference/mean": 0.012916021049022675, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7923.0, "completions/max_terminated_length": 7923.0, "completions/mean_length": 3778.416748046875, "completions/mean_terminated_length": 3778.416748046875, "completions/min_length": 1325.0, "completions/min_terminated_length": 1325.0, "entropy": 0.4029810503125191, "epoch": 0.023032629558541268, "frac_reward_zero_std": 0.0, "grad_norm": 0.09241918705604697, "kl": 0.00023375315504381433, "learning_rate": 9.98763258970744e-07, "loss": 0.036, "num_tokens": 3041835.0, "reward": 0.9166666865348816, "reward_std": 0.7350384593009949, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434515476227, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.00006902217865, "sampling/importance_sampling_ratio/min": 0.20887427031993866, "sampling/sampling_logp_difference/max": 1.5660227537155151, "sampling/sampling_logp_difference/mean": 0.011355048976838589, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 3178.0, "completions/mean_length": 2409.791748046875, "completions/mean_terminated_length": 2158.391357421875, "completions/min_length": 555.0, "completions/min_terminated_length": 555.0, "entropy": 0.1861964352428913, "epoch": 0.023672424824056303, "frac_reward_zero_std": 0.0, "grad_norm": 0.07348409069941825, "kl": 0.00013670947009813972, "learning_rate": 9.986916097482202e-07, "loss": 0.1344, "num_tokens": 3132990.0, "reward": 1.0, "reward_std": 0.8556844592094421, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6193567514419556, "sampling/importance_sampling_ratio/mean": 1.0000065565109253, "sampling/importance_sampling_ratio/min": 0.6730777025222778, "sampling/sampling_logp_difference/max": 0.4820289611816406, "sampling/sampling_logp_difference/mean": 0.005295700393617153, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5019.0, "completions/max_terminated_length": 5019.0, "completions/mean_length": 1754.2083740234375, "completions/mean_terminated_length": 1754.2083740234375, "completions/min_length": 809.0, "completions/min_terminated_length": 809.0, "entropy": 0.2570986822247505, "epoch": 0.02431222008957134, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1158583175137219, "kl": 0.00021523879695450887, "learning_rate": 9.986179458103035e-07, "loss": 0.1138, "num_tokens": 3189731.0, "reward": 1.0, "reward_std": 0.5605830550193787, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.7601292133331299, "sampling/importance_sampling_ratio/mean": 0.9998245239257812, "sampling/importance_sampling_ratio/min": 0.5696007013320923, "sampling/sampling_logp_difference/max": 0.5653872489929199, "sampling/sampling_logp_difference/mean": 0.007862783037126064, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7095.0, "completions/max_terminated_length": 7095.0, "completions/mean_length": 2952.08349609375, "completions/mean_terminated_length": 2952.08349609375, "completions/min_length": 1302.0, "completions/min_terminated_length": 1302.0, "entropy": 0.2814563736319542, "epoch": 0.02495201535508637, "frac_reward_zero_std": 0.0, "grad_norm": 0.08637701389549418, "kl": 0.0002514226289349608, "learning_rate": 9.985422674545958e-07, "loss": 0.1073, "num_tokens": 3275989.0, "reward": 0.8333333730697632, "reward_std": 0.7781586050987244, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.6118911504745483, "sampling/importance_sampling_ratio/mean": 1.0001181364059448, "sampling/importance_sampling_ratio/min": 0.5883012413978577, "sampling/sampling_logp_difference/max": 0.5305161476135254, "sampling/sampling_logp_difference/mean": 0.008452240377664566, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3996.0, "completions/max_terminated_length": 3996.0, "completions/mean_length": 1787.9583740234375, "completions/mean_terminated_length": 1787.9583740234375, "completions/min_length": 768.0, "completions/min_terminated_length": 768.0, "entropy": 0.2618107236921787, "epoch": 0.025591810620601407, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.10502431480898372, "kl": 0.0002726635357248597, "learning_rate": 9.984645749868384e-07, "loss": 0.0436, "num_tokens": 3333524.0, "reward": 0.5416666865348816, "reward_std": 0.17251639068126678, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.3618656396865845, "sampling/importance_sampling_ratio/mean": 1.000091314315796, "sampling/importance_sampling_ratio/min": 0.5460015535354614, "sampling/sampling_logp_difference/max": 0.6051335334777832, "sampling/sampling_logp_difference/mean": 0.00727539649233222, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3130.0, "completions/max_terminated_length": 3130.0, "completions/mean_length": 1702.125, "completions/mean_terminated_length": 1702.125, "completions/min_length": 738.0, "completions/min_terminated_length": 738.0, "entropy": 0.35831476002931595, "epoch": 0.026231605886116442, "frac_reward_zero_std": 0.0, "grad_norm": 0.12852260178957023, "kl": 0.00034770762431435287, "learning_rate": 9.98384868720909e-07, "loss": 0.099, "num_tokens": 3385311.0, "reward": 1.375, "reward_std": 0.8224833011627197, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.3141894340515137, "sampling/importance_sampling_ratio/mean": 1.0001062154769897, "sampling/importance_sampling_ratio/min": 0.43822309374809265, "sampling/sampling_logp_difference/max": 0.8250272274017334, "sampling/sampling_logp_difference/mean": 0.010007518343627453, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4821.0, "completions/mean_length": 2212.75, "completions/mean_terminated_length": 1952.7825927734375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "entropy": 0.3148401230573654, "epoch": 0.026871401151631478, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08212402324464808, "kl": 0.0003310905449325219, "learning_rate": 9.983031489788207e-07, "loss": 0.0894, "num_tokens": 3454065.0, "reward": 0.75, "reward_std": 0.5443090200424194, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6602191925048828, "sampling/importance_sampling_ratio/mean": 0.9998229146003723, "sampling/importance_sampling_ratio/min": 0.30113208293914795, "sampling/sampling_logp_difference/max": 1.2002062797546387, "sampling/sampling_logp_difference/mean": 0.009016720578074455, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5766.0, "completions/max_terminated_length": 5766.0, "completions/mean_length": 2719.916748046875, "completions/mean_terminated_length": 2719.916748046875, "completions/min_length": 960.0, "completions/min_terminated_length": 960.0, "entropy": 0.3531555086374283, "epoch": 0.027511196417146513, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.03333679092123049, "kl": 0.0002491454615665134, "learning_rate": 9.982194160907218e-07, "loss": -0.0345, "num_tokens": 3540583.0, "reward": 0.5416666865348816, "reward_std": 0.24800793826580048, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999919533729553, "sampling/importance_sampling_ratio/min": 0.2231961488723755, "sampling/sampling_logp_difference/max": 1.499704360961914, "sampling/sampling_logp_difference/mean": 0.010253618471324444, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7212.0, "completions/max_terminated_length": 7212.0, "completions/mean_length": 4132.20849609375, "completions/mean_terminated_length": 4132.20849609375, "completions/min_length": 1964.0, "completions/min_terminated_length": 1964.0, "entropy": 0.35301539301872253, "epoch": 0.02815099168266155, "frac_reward_zero_std": 0.0, "grad_norm": 0.07910494910461546, "kl": 0.0002448799532430712, "learning_rate": 9.981336703948933e-07, "loss": -0.0774, "num_tokens": 3654180.0, "reward": 0.875, "reward_std": 0.5383754968643188, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.6838886737823486, "sampling/importance_sampling_ratio/mean": 1.000016450881958, "sampling/importance_sampling_ratio/min": 0.5410737991333008, "sampling/sampling_logp_difference/max": 0.6141996383666992, "sampling/sampling_logp_difference/mean": 0.009832041338086128, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5525.0, "completions/max_terminated_length": 5525.0, "completions/mean_length": 2960.291748046875, "completions/mean_terminated_length": 2960.291748046875, "completions/min_length": 1316.0, "completions/min_terminated_length": 1316.0, "entropy": 0.3159840442240238, "epoch": 0.028790786948176585, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1040661553757635, "kl": 0.000253085028816713, "learning_rate": 9.980459122377483e-07, "loss": 0.082, "num_tokens": 3745075.0, "reward": 0.875, "reward_std": 0.6043562889099121, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.927878737449646, "sampling/importance_sampling_ratio/mean": 1.000001311302185, "sampling/importance_sampling_ratio/min": 0.24346290528774261, "sampling/sampling_logp_difference/max": 1.4127906560897827, "sampling/sampling_logp_difference/mean": 0.009086205624043941, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3945.0, "completions/max_terminated_length": 3945.0, "completions/mean_length": 2273.75, "completions/mean_terminated_length": 2273.75, "completions/min_length": 1569.0, "completions/min_terminated_length": 1569.0, "entropy": 0.20696231722831726, "epoch": 0.02943058221369162, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06831874439732835, "kl": 0.00022276454183156602, "learning_rate": 9.979561419738296e-07, "loss": 0.0068, "num_tokens": 3821269.0, "reward": 0.8333333730697632, "reward_std": 0.39000558853149414, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.4818735122680664, "sampling/importance_sampling_ratio/mean": 0.9999757409095764, "sampling/importance_sampling_ratio/min": 0.695088267326355, "sampling/sampling_logp_difference/max": 0.3933071494102478, "sampling/sampling_logp_difference/mean": 0.00608034199103713, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7721.0, "completions/mean_length": 4035.58349609375, "completions/mean_terminated_length": 3854.86962890625, "completions/min_length": 1460.0, "completions/min_terminated_length": 1460.0, "entropy": 0.3822130858898163, "epoch": 0.030070377479206652, "frac_reward_zero_std": 0.0, "grad_norm": 0.07626195527078487, "kl": 0.00030238375620683655, "learning_rate": 9.978643599658094e-07, "loss": 0.1024, "num_tokens": 3930291.0, "reward": 0.4583333432674408, "reward_std": 0.7930537462234497, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.5910881757736206, "sampling/importance_sampling_ratio/mean": 1.000034213066101, "sampling/importance_sampling_ratio/min": 0.37726184725761414, "sampling/sampling_logp_difference/max": 0.974815845489502, "sampling/sampling_logp_difference/mean": 0.010859820060431957, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7461.0, "completions/max_terminated_length": 7461.0, "completions/mean_length": 2677.416748046875, "completions/mean_terminated_length": 2677.416748046875, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.3917676582932472, "epoch": 0.030710172744721688, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10344572383196686, "kl": 0.0003679660949273966, "learning_rate": 9.977705665844874e-07, "loss": -0.0012, "num_tokens": 4006789.0, "reward": 0.875, "reward_std": 0.5930407047271729, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6344233751296997, "sampling/importance_sampling_ratio/mean": 1.0001460313796997, "sampling/importance_sampling_ratio/min": 0.5140222907066345, "sampling/sampling_logp_difference/max": 0.6654887199401855, "sampling/sampling_logp_difference/mean": 0.010439129546284676, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 6837.0, "completions/mean_length": 3393.291748046875, "completions/mean_terminated_length": 2707.761962890625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6973473131656647, "epoch": 0.03134996801023673, "frac_reward_zero_std": 0.0, "grad_norm": 0.08790325477098787, "kl": 0.0006604967202292755, "learning_rate": 9.976747622087889e-07, "loss": 0.1375, "num_tokens": 4101580.0, "reward": 1.0, "reward_std": 0.6804856061935425, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.3723112344741821, "sampling/importance_sampling_ratio/mean": 0.9998340606689453, "sampling/importance_sampling_ratio/min": 0.6079563498497009, "sampling/sampling_logp_difference/max": 0.49765217304229736, "sampling/sampling_logp_difference/mean": 0.010856474749743938, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4664.0, "completions/max_terminated_length": 4664.0, "completions/mean_length": 1780.666748046875, "completions/mean_terminated_length": 1780.666748046875, "completions/min_length": 731.0, "completions/min_terminated_length": 731.0, "entropy": 0.26041147485375404, "epoch": 0.03198976327575176, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.2743780656148781, "kl": 0.00032419701165053993, "learning_rate": 9.97576947225764e-07, "loss": -0.1581, "num_tokens": 4163140.0, "reward": 0.9166666865348816, "reward_std": 0.487678587436676, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.5278270244598389, "sampling/importance_sampling_ratio/mean": 0.9998288154602051, "sampling/importance_sampling_ratio/min": 0.19939342141151428, "sampling/sampling_logp_difference/max": 1.6124753952026367, "sampling/sampling_logp_difference/mean": 0.008223151788115501, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 1611.0833740234375, "completions/mean_terminated_length": 1611.0833740234375, "completions/min_length": 659.0, "completions/min_terminated_length": 659.0, "entropy": 0.17791056632995605, "epoch": 0.03262955854126679, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.054913344751678056, "kl": 0.00023579353728564456, "learning_rate": 9.974771220305853e-07, "loss": 0.0077, "num_tokens": 4225830.0, "reward": 1.4583333730697632, "reward_std": 0.17251639068126678, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 1.5520988702774048, "sampling/importance_sampling_ratio/mean": 1.0000356435775757, "sampling/importance_sampling_ratio/min": 0.46175700426101685, "sampling/sampling_logp_difference/max": 0.7727165222167969, "sampling/sampling_logp_difference/mean": 0.005265339277684689, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5813.0, "completions/max_terminated_length": 5813.0, "completions/mean_length": 3217.541748046875, "completions/mean_terminated_length": 3217.541748046875, "completions/min_length": 1394.0, "completions/min_terminated_length": 1394.0, "entropy": 0.26225052028894424, "epoch": 0.03326935380678183, "frac_reward_zero_std": 0.0, "grad_norm": 0.0854163063680107, "kl": 0.0002907448506448418, "learning_rate": 9.97375287026547e-07, "loss": 0.0249, "num_tokens": 4325283.0, "reward": 0.2916666865348816, "reward_std": 0.6439208984375, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.9385006427764893, "sampling/importance_sampling_ratio/mean": 0.999915361404419, "sampling/importance_sampling_ratio/min": 0.5689524412155151, "sampling/sampling_logp_difference/max": 0.6619148254394531, "sampling/sampling_logp_difference/mean": 0.007781282067298889, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5029.0, "completions/max_terminated_length": 5029.0, "completions/mean_length": 2137.70849609375, "completions/mean_terminated_length": 2137.70849609375, "completions/min_length": 746.0, "completions/min_terminated_length": 746.0, "entropy": 0.29937341809272766, "epoch": 0.03390914907229686, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09048005488639553, "kl": 0.0003177953985868953, "learning_rate": 9.97271442625063e-07, "loss": 0.0089, "num_tokens": 4389964.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.842655062675476, "sampling/importance_sampling_ratio/mean": 0.9998536109924316, "sampling/importance_sampling_ratio/min": 0.6232988238334656, "sampling/sampling_logp_difference/max": 0.6112074851989746, "sampling/sampling_logp_difference/mean": 0.008354097604751587, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5275.0, "completions/max_terminated_length": 5275.0, "completions/mean_length": 2391.541748046875, "completions/mean_terminated_length": 2391.541748046875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 0.24402432143688202, "epoch": 0.0345489443378119, "frac_reward_zero_std": 0.0, "grad_norm": 0.09658144928734438, "kl": 0.000338466175890062, "learning_rate": 9.971655892456645e-07, "loss": -0.158, "num_tokens": 4468105.0, "reward": 0.9583333730697632, "reward_std": 0.6621600985527039, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000156283378601, "sampling/importance_sampling_ratio/min": 0.46123942732810974, "sampling/sampling_logp_difference/max": 0.8082661628723145, "sampling/sampling_logp_difference/mean": 0.007414020597934723, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3202.0, "completions/max_terminated_length": 3202.0, "completions/mean_length": 1962.125, "completions/mean_terminated_length": 1962.125, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "entropy": 0.24915092810988426, "epoch": 0.035188739603326934, "frac_reward_zero_std": 0.0, "grad_norm": 0.09799165977342911, "kl": 0.0003374768130015582, "learning_rate": 9.970577273159994e-07, "loss": 0.019, "num_tokens": 4534404.0, "reward": 0.7916666865348816, "reward_std": 0.7828061580657959, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.7194894552230835, "sampling/importance_sampling_ratio/mean": 1.0002591609954834, "sampling/importance_sampling_ratio/min": 0.5126930475234985, "sampling/sampling_logp_difference/max": 0.6680779457092285, "sampling/sampling_logp_difference/mean": 0.0076023126021027565, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6921.0, "completions/max_terminated_length": 6921.0, "completions/mean_length": 2668.45849609375, "completions/mean_terminated_length": 2668.45849609375, "completions/min_length": 1173.0, "completions/min_terminated_length": 1173.0, "entropy": 0.314094141125679, "epoch": 0.03582853486884197, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08815827016456361, "kl": 0.00036068299232283607, "learning_rate": 9.969478572718307e-07, "loss": -0.0192, "num_tokens": 4615511.0, "reward": 1.25, "reward_std": 0.5443090200424194, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000020980834961, "sampling/importance_sampling_ratio/min": 0.349090039730072, "sampling/sampling_logp_difference/max": 1.0774736404418945, "sampling/sampling_logp_difference/mean": 0.00906948558986187, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3955.0, "completions/max_terminated_length": 3955.0, "completions/mean_length": 2398.291748046875, "completions/mean_terminated_length": 2398.291748046875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.2684570215642452, "epoch": 0.036468330134357005, "frac_reward_zero_std": 0.0, "grad_norm": 0.09039628473374696, "kl": 0.013376397462707246, "learning_rate": 9.968359795570331e-07, "loss": -0.055, "num_tokens": 4699286.0, "reward": 1.2083333730697632, "reward_std": 0.8587582111358643, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.4938322305679321, "sampling/importance_sampling_ratio/mean": 0.9998338222503662, "sampling/importance_sampling_ratio/min": 0.5032355189323425, "sampling/sampling_logp_difference/max": 0.6866970062255859, "sampling/sampling_logp_difference/mean": 0.006987412925809622, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 4955.0, "completions/mean_length": 2916.70849609375, "completions/mean_terminated_length": 2437.136474609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5437277108430862, "epoch": 0.037108125399872044, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.05770801982343429, "kl": 0.0005960082671663258, "learning_rate": 9.967220946235933e-07, "loss": -0.0663, "num_tokens": 4795895.0, "reward": 0.875, "reward_std": 0.3053751587867737, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5246386528015137, "sampling/importance_sampling_ratio/mean": 0.9999831318855286, "sampling/importance_sampling_ratio/min": 0.5283488035202026, "sampling/sampling_logp_difference/max": 0.6379985809326172, "sampling/sampling_logp_difference/mean": 0.00911279022693634, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3297.0, "completions/max_terminated_length": 3297.0, "completions/mean_length": 2131.541748046875, "completions/mean_terminated_length": 2131.541748046875, "completions/min_length": 1387.0, "completions/min_terminated_length": 1387.0, "entropy": 0.23722957074642181, "epoch": 0.037747920665387076, "frac_reward_zero_std": 0.0, "grad_norm": 0.09090817569518092, "kl": 0.0003500432721921243, "learning_rate": 9.966062029316064e-07, "loss": 0.0207, "num_tokens": 4865092.0, "reward": 1.125, "reward_std": 0.8079166412353516, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.4883588552474976, "sampling/importance_sampling_ratio/mean": 0.9998459815979004, "sampling/importance_sampling_ratio/min": 0.3166276216506958, "sampling/sampling_logp_difference/max": 1.150028944015503, "sampling/sampling_logp_difference/mean": 0.006660550367087126, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 5716.0, "completions/mean_length": 2655.166748046875, "completions/mean_terminated_length": 2151.818359375, "completions/min_length": 766.0, "completions/min_terminated_length": 766.0, "entropy": 0.24601248279213905, "epoch": 0.03838771593090211, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08762008798050593, "kl": 0.0003506199791445397, "learning_rate": 9.964883049492754e-07, "loss": 0.1136, "num_tokens": 4942880.0, "reward": 1.4166667461395264, "reward_std": 0.47364258766174316, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000101089477539, "sampling/importance_sampling_ratio/min": 0.49826285243034363, "sampling/sampling_logp_difference/max": 0.8027422428131104, "sampling/sampling_logp_difference/mean": 0.007630274631083012, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7319.0, "completions/max_terminated_length": 7319.0, "completions/mean_length": 3106.541748046875, "completions/mean_terminated_length": 3106.541748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.7158063426613808, "epoch": 0.03902751119641715, "frac_reward_zero_std": 0.0, "grad_norm": 0.09666089479325081, "kl": 0.0007185659487731755, "learning_rate": 9.963684011529082e-07, "loss": -0.1168, "num_tokens": 5040557.0, "reward": 0.6666666865348816, "reward_std": 0.6900655627250671, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000853538513184, "sampling/importance_sampling_ratio/min": 0.24968737363815308, "sampling/sampling_logp_difference/max": 1.3875457048416138, "sampling/sampling_logp_difference/mean": 0.009225002489984035, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6813.0, "completions/max_terminated_length": 6813.0, "completions/mean_length": 2963.45849609375, "completions/mean_terminated_length": 2963.45849609375, "completions/min_length": 1311.0, "completions/min_terminated_length": 1311.0, "entropy": 0.3135879933834076, "epoch": 0.03966730646193218, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08857557517183516, "kl": 0.00033343689574394375, "learning_rate": 9.962464920269168e-07, "loss": 0.0748, "num_tokens": 5128288.0, "reward": 1.4583333730697632, "reward_std": 0.6389504671096802, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998853802680969, "sampling/importance_sampling_ratio/min": 0.44534772634506226, "sampling/sampling_logp_difference/max": 0.9572093486785889, "sampling/sampling_logp_difference/mean": 0.008364109322428703, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7555.0, "completions/max_terminated_length": 7555.0, "completions/mean_length": 2536.416748046875, "completions/mean_terminated_length": 2536.416748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6435521617531776, "epoch": 0.04030710172744722, "frac_reward_zero_std": 0.0, "grad_norm": 0.09496365631615684, "kl": 0.0008523866854375228, "learning_rate": 9.961225780638138e-07, "loss": -0.0508, "num_tokens": 5199442.0, "reward": 1.125, "reward_std": 0.7366234064102173, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.4664933681488037, "sampling/importance_sampling_ratio/mean": 0.9998078346252441, "sampling/importance_sampling_ratio/min": 0.5934869050979614, "sampling/sampling_logp_difference/max": 0.521740198135376, "sampling/sampling_logp_difference/mean": 0.010100012645125389, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5069.0, "completions/max_terminated_length": 5069.0, "completions/mean_length": 2794.45849609375, "completions/mean_terminated_length": 2794.45849609375, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.43208979070186615, "epoch": 0.04094689699296225, "frac_reward_zero_std": 0.0, "grad_norm": 0.09118653916524687, "kl": 0.010960717198031489, "learning_rate": 9.959966597642125e-07, "loss": -0.0695, "num_tokens": 5287653.0, "reward": 0.7916666865348816, "reward_std": 0.6592972874641418, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.6871562004089355, "sampling/importance_sampling_ratio/mean": 1.000125765800476, "sampling/importance_sampling_ratio/min": 0.5456039905548096, "sampling/sampling_logp_difference/max": 0.6058619022369385, "sampling/sampling_logp_difference/mean": 0.00926181674003601, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5161.0, "completions/max_terminated_length": 5161.0, "completions/mean_length": 2762.25, "completions/mean_terminated_length": 2762.25, "completions/min_length": 1506.0, "completions/min_terminated_length": 1506.0, "entropy": 0.28833429142832756, "epoch": 0.04158669225847729, "frac_reward_zero_std": 0.0, "grad_norm": 0.10610261786597791, "kl": 0.0003800668055191636, "learning_rate": 9.95868737636823e-07, "loss": -0.0141, "num_tokens": 5371227.0, "reward": 0.7916666865348816, "reward_std": 0.865221381187439, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000101923942566, "sampling/importance_sampling_ratio/min": 0.39675387740135193, "sampling/sampling_logp_difference/max": 0.9244391918182373, "sampling/sampling_logp_difference/mean": 0.008464775048196316, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4722.0, "completions/max_terminated_length": 4722.0, "completions/mean_length": 2747.541748046875, "completions/mean_terminated_length": 2747.541748046875, "completions/min_length": 683.0, "completions/min_terminated_length": 683.0, "entropy": 0.24375587329268456, "epoch": 0.04222648752399232, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06214582166597877, "kl": 0.0004569664233713411, "learning_rate": 9.95738812198451e-07, "loss": 0.0287, "num_tokens": 5456808.0, "reward": 0.5, "reward_std": 0.5028601884841919, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.5501682758331299, "sampling/importance_sampling_ratio/mean": 0.9999876022338867, "sampling/importance_sampling_ratio/min": 0.5071703791618347, "sampling/sampling_logp_difference/max": 0.6789082288742065, "sampling/sampling_logp_difference/mean": 0.006796640809625387, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7020.0, "completions/mean_length": 4241.9169921875, "completions/mean_terminated_length": 3882.818359375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6302141100168228, "epoch": 0.04286628278950736, "frac_reward_zero_std": 0.0, "grad_norm": 0.09159159211344355, "kl": 0.0006433057933463715, "learning_rate": 9.956068839739953e-07, "loss": -0.0858, "num_tokens": 5575318.0, "reward": 0.3333333432674408, "reward_std": 0.5311707258224487, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998698830604553, "sampling/importance_sampling_ratio/min": 0.40052926540374756, "sampling/sampling_logp_difference/max": 0.9149684906005859, "sampling/sampling_logp_difference/mean": 0.012019122019410133, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4422.0, "completions/max_terminated_length": 4422.0, "completions/mean_length": 2377.875, "completions/mean_terminated_length": 2377.875, "completions/min_length": 1331.0, "completions/min_terminated_length": 1331.0, "entropy": 0.3372056484222412, "epoch": 0.04350607805502239, "frac_reward_zero_std": 0.0, "grad_norm": 0.10130539059546105, "kl": 0.0005126209653099068, "learning_rate": 9.954729534964467e-07, "loss": -0.0105, "num_tokens": 5653075.0, "reward": 0.625, "reward_std": 0.8296799063682556, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999935030937195, "sampling/importance_sampling_ratio/min": 0.36267775297164917, "sampling/sampling_logp_difference/max": 1.0142406225204468, "sampling/sampling_logp_difference/mean": 0.009269550442695618, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3045.0, "completions/max_terminated_length": 3045.0, "completions/mean_length": 1500.666748046875, "completions/mean_terminated_length": 1500.666748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.834724485874176, "epoch": 0.044145873320537425, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10662363330774134, "kl": 0.0011545464803930372, "learning_rate": 9.953370213068847e-07, "loss": -0.2455, "num_tokens": 5697115.0, "reward": 0.5, "reward_std": 0.5783516764640808, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.450461745262146, "sampling/importance_sampling_ratio/mean": 0.9999475479125977, "sampling/importance_sampling_ratio/min": 0.5584372878074646, "sampling/sampling_logp_difference/max": 0.5826129913330078, "sampling/sampling_logp_difference/mean": 0.010296277701854706, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6094.0, "completions/max_terminated_length": 6094.0, "completions/mean_length": 2567.166748046875, "completions/mean_terminated_length": 2567.166748046875, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.4044562354683876, "epoch": 0.044785668586052464, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10707821275124475, "kl": 0.0007553877730970271, "learning_rate": 9.951990879544753e-07, "loss": 0.048, "num_tokens": 5771007.0, "reward": 1.5833333730697632, "reward_std": 0.487678587436676, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.603109359741211, "sampling/importance_sampling_ratio/mean": 1.000125527381897, "sampling/importance_sampling_ratio/min": 0.4671770930290222, "sampling/sampling_logp_difference/max": 0.7610468864440918, "sampling/sampling_logp_difference/mean": 0.011606029234826565, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7264.0, "completions/max_terminated_length": 7264.0, "completions/mean_length": 3974.70849609375, "completions/mean_terminated_length": 3974.70849609375, "completions/min_length": 1117.0, "completions/min_terminated_length": 1117.0, "entropy": 0.5208060145378113, "epoch": 0.045425463851567496, "frac_reward_zero_std": 0.0, "grad_norm": 0.10373871920742946, "kl": 0.0004712201771326363, "learning_rate": 9.950591539964696e-07, "loss": -0.0203, "num_tokens": 5877744.0, "reward": 0.2916666865348816, "reward_std": 0.45032864809036255, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5821716785430908, "sampling/importance_sampling_ratio/mean": 1.0000628232955933, "sampling/importance_sampling_ratio/min": 0.5833076238632202, "sampling/sampling_logp_difference/max": 0.5390405654907227, "sampling/sampling_logp_difference/mean": 0.013574643060564995, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7604.0, "completions/max_terminated_length": 7604.0, "completions/mean_length": 3866.58349609375, "completions/mean_terminated_length": 3866.58349609375, "completions/min_length": 522.0, "completions/min_terminated_length": 522.0, "entropy": 0.5258466899394989, "epoch": 0.046065259117082535, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08387461719705497, "kl": 0.0005290417539072223, "learning_rate": 9.949172199982017e-07, "loss": 0.0648, "num_tokens": 5982686.0, "reward": 0.4583333432674408, "reward_std": 0.6504079103469849, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000544786453247, "sampling/importance_sampling_ratio/min": 0.5021117329597473, "sampling/sampling_logp_difference/max": 0.8021848201751709, "sampling/sampling_logp_difference/mean": 0.013551014475524426, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7362.0, "completions/max_terminated_length": 7362.0, "completions/mean_length": 3334.75, "completions/mean_terminated_length": 3334.75, "completions/min_length": 1372.0, "completions/min_terminated_length": 1372.0, "entropy": 0.31827980279922485, "epoch": 0.04670505438259757, "frac_reward_zero_std": 0.0, "grad_norm": 0.09355882190989258, "kl": 0.0005152645462658256, "learning_rate": 9.94773286533085e-07, "loss": 0.0677, "num_tokens": 6084032.0, "reward": 1.2083333730697632, "reward_std": 0.9318138360977173, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001453161239624, "sampling/importance_sampling_ratio/min": 0.07116857171058655, "sampling/sampling_logp_difference/max": 2.6427040100097656, "sampling/sampling_logp_difference/mean": 0.009224181063473225, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 4009.0, "completions/mean_length": 2832.70849609375, "completions/mean_terminated_length": 2345.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 1.0088451579213142, "epoch": 0.04734484964811261, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.05381736095488611, "kl": 0.0005837480566697195, "learning_rate": 9.946273541826109e-07, "loss": 0.0418, "num_tokens": 6182841.0, "reward": 0.5, "reward_std": 0.17817416787147522, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.9620736837387085, "sampling/importance_sampling_ratio/mean": 1.000083088874817, "sampling/importance_sampling_ratio/min": 0.4253072142601013, "sampling/sampling_logp_difference/max": 0.8549435138702393, "sampling/sampling_logp_difference/mean": 0.011555149219930172, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6670.0, "completions/max_terminated_length": 6670.0, "completions/mean_length": 2638.20849609375, "completions/mean_terminated_length": 2638.20849609375, "completions/min_length": 836.0, "completions/min_terminated_length": 836.0, "entropy": 0.2656707391142845, "epoch": 0.04798464491362764, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12390892714264536, "kl": 0.000490904065372888, "learning_rate": 9.944794235363467e-07, "loss": -0.1238, "num_tokens": 6262198.0, "reward": 1.0, "reward_std": 0.39000558853149414, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998758435249329, "sampling/importance_sampling_ratio/min": 0.407681405544281, "sampling/sampling_logp_difference/max": 0.8972692489624023, "sampling/sampling_logp_difference/mean": 0.008126875385642052, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3832.0, "completions/max_terminated_length": 3832.0, "completions/mean_length": 2196.70849609375, "completions/mean_terminated_length": 2196.70849609375, "completions/min_length": 1301.0, "completions/min_terminated_length": 1301.0, "entropy": 0.2378329038619995, "epoch": 0.04862444017914268, "frac_reward_zero_std": 0.0, "grad_norm": 0.09200396401634518, "kl": 0.0004541332455119118, "learning_rate": 9.943294951919325e-07, "loss": -0.0719, "num_tokens": 6333959.0, "reward": 1.0833333730697632, "reward_std": 0.6658527255058289, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.475268840789795, "sampling/importance_sampling_ratio/mean": 1.0000016689300537, "sampling/importance_sampling_ratio/min": 0.5592758059501648, "sampling/sampling_logp_difference/max": 0.5811125040054321, "sampling/sampling_logp_difference/mean": 0.007009579800069332, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4245.0, "completions/max_terminated_length": 4245.0, "completions/mean_length": 2060.58349609375, "completions/mean_terminated_length": 2060.58349609375, "completions/min_length": 1060.0, "completions/min_terminated_length": 1060.0, "entropy": 0.3827710822224617, "epoch": 0.04926423544465771, "frac_reward_zero_std": 0.0, "grad_norm": 0.1229520252708069, "kl": 0.0005053640707046725, "learning_rate": 9.941775697550795e-07, "loss": 0.0079, "num_tokens": 6397293.0, "reward": 0.9166666865348816, "reward_std": 0.6024982929229736, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.7446788549423218, "sampling/importance_sampling_ratio/mean": 1.000214695930481, "sampling/importance_sampling_ratio/min": 0.7081745266914368, "sampling/sampling_logp_difference/max": 0.5565705299377441, "sampling/sampling_logp_difference/mean": 0.010839438065886497, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6430.0, "completions/max_terminated_length": 6430.0, "completions/mean_length": 3451.33349609375, "completions/mean_terminated_length": 3451.33349609375, "completions/min_length": 1982.0, "completions/min_terminated_length": 1982.0, "entropy": 0.3066505640745163, "epoch": 0.04990403071017274, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.04995641276218854, "kl": 0.00040876057755667716, "learning_rate": 9.940236478395662e-07, "loss": -0.0237, "num_tokens": 6508973.0, "reward": 0.7916666865348816, "reward_std": 0.5930407047271729, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000206232070923, "sampling/importance_sampling_ratio/min": 0.6309257745742798, "sampling/sampling_logp_difference/max": 0.8501904010772705, "sampling/sampling_logp_difference/mean": 0.008827759884297848, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4199.0, "completions/max_terminated_length": 4199.0, "completions/mean_length": 2179.541748046875, "completions/mean_terminated_length": 2179.541748046875, "completions/min_length": 816.0, "completions/min_terminated_length": 816.0, "entropy": 0.28756125271320343, "epoch": 0.05054382597568778, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0808019753519182, "kl": 0.0005021094693802297, "learning_rate": 9.93867730067238e-07, "loss": -0.0132, "num_tokens": 6581010.0, "reward": 0.7083333730697632, "reward_std": 0.2721545100212097, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.6096611022949219, "sampling/importance_sampling_ratio/mean": 1.0000375509262085, "sampling/importance_sampling_ratio/min": 0.666896641254425, "sampling/sampling_logp_difference/max": 0.47602367401123047, "sampling/sampling_logp_difference/mean": 0.008572056889533997, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6836.0, "completions/max_terminated_length": 6836.0, "completions/mean_length": 2735.08349609375, "completions/mean_terminated_length": 2735.08349609375, "completions/min_length": 968.0, "completions/min_terminated_length": 968.0, "entropy": 0.3764494061470032, "epoch": 0.05118362124120281, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08959937877048406, "kl": 0.0006145104998722672, "learning_rate": 9.937098170680032e-07, "loss": 0.0699, "num_tokens": 6661628.0, "reward": 1.375, "reward_std": 0.6258121728897095, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.662830114364624, "sampling/importance_sampling_ratio/mean": 1.000106930732727, "sampling/importance_sampling_ratio/min": 0.6653513312339783, "sampling/sampling_logp_difference/max": 0.5085210800170898, "sampling/sampling_logp_difference/mean": 0.010895302519202232, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5678.0, "completions/max_terminated_length": 5678.0, "completions/mean_length": 3335.541748046875, "completions/mean_terminated_length": 3335.541748046875, "completions/min_length": 1987.0, "completions/min_terminated_length": 1987.0, "entropy": 0.36196205765008926, "epoch": 0.05182341650671785, "frac_reward_zero_std": 0.0, "grad_norm": 0.09978365418579689, "kl": 0.000545060531294439, "learning_rate": 9.935499094798304e-07, "loss": 0.0711, "num_tokens": 6768033.0, "reward": 0.6666666865348816, "reward_std": 0.8893417716026306, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.5199538469314575, "sampling/importance_sampling_ratio/mean": 1.0001448392868042, "sampling/importance_sampling_ratio/min": 0.3757374584674835, "sampling/sampling_logp_difference/max": 0.9788646697998047, "sampling/sampling_logp_difference/mean": 0.010121184401214123, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7847.0, "completions/mean_length": 3572.041748046875, "completions/mean_terminated_length": 3371.174072265625, "completions/min_length": 1267.0, "completions/min_terminated_length": 1267.0, "entropy": 0.3967452570796013, "epoch": 0.052463211772232884, "frac_reward_zero_std": 0.0, "grad_norm": 0.09555419229652477, "kl": 0.0005535882519325241, "learning_rate": 9.933880079487467e-07, "loss": 0.094, "num_tokens": 6869434.0, "reward": 1.4166667461395264, "reward_std": 0.6863929629325867, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.7629855871200562, "sampling/importance_sampling_ratio/mean": 1.0001263618469238, "sampling/importance_sampling_ratio/min": 0.3730103075504303, "sampling/sampling_logp_difference/max": 0.9861493110656738, "sampling/sampling_logp_difference/mean": 0.010937748476862907, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7465.0, "completions/max_terminated_length": 7465.0, "completions/mean_length": 3324.125, "completions/mean_terminated_length": 3324.125, "completions/min_length": 1553.0, "completions/min_terminated_length": 1553.0, "entropy": 0.3541814833879471, "epoch": 0.053103007037747924, "frac_reward_zero_std": 0.0, "grad_norm": 0.10035423180310356, "kl": 0.0005903631536057219, "learning_rate": 9.93224113128835e-07, "loss": -0.0196, "num_tokens": 6973229.0, "reward": 0.4166666865348816, "reward_std": 0.7108919024467468, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.9074053764343262, "sampling/importance_sampling_ratio/mean": 0.9998669624328613, "sampling/importance_sampling_ratio/min": 0.4895213842391968, "sampling/sampling_logp_difference/max": 0.7143270969390869, "sampling/sampling_logp_difference/mean": 0.010029050521552563, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7747.0, "completions/mean_length": 3335.0, "completions/mean_terminated_length": 3123.826171875, "completions/min_length": 907.0, "completions/min_terminated_length": 907.0, "entropy": 0.3434460163116455, "epoch": 0.053742802303262956, "frac_reward_zero_std": 0.0, "grad_norm": 0.09616734779921429, "kl": 0.0006176212191348895, "learning_rate": 9.930582256822307e-07, "loss": -0.0698, "num_tokens": 7069781.0, "reward": 0.75, "reward_std": 0.5748276710510254, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001407861709595, "sampling/importance_sampling_ratio/min": 0.5366809964179993, "sampling/sampling_logp_difference/max": 1.0817384719848633, "sampling/sampling_logp_difference/mean": 0.010268326848745346, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5184.0, "completions/max_terminated_length": 5184.0, "completions/mean_length": 3065.25, "completions/mean_terminated_length": 3065.25, "completions/min_length": 1716.0, "completions/min_terminated_length": 1716.0, "entropy": 0.25898754596710205, "epoch": 0.05438259756877799, "frac_reward_zero_std": 0.0, "grad_norm": 0.09247848295544563, "kl": 0.0006613866426050663, "learning_rate": 9.928903462791194e-07, "loss": -0.0208, "num_tokens": 7163531.0, "reward": 0.5416666865348816, "reward_std": 0.697779655456543, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999992251396179, "sampling/importance_sampling_ratio/min": 0.5630877017974854, "sampling/sampling_logp_difference/max": 0.7375435829162598, "sampling/sampling_logp_difference/mean": 0.00803414173424244, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3621.0, "completions/max_terminated_length": 3621.0, "completions/mean_length": 2297.541748046875, "completions/mean_terminated_length": 2297.541748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.2463398054242134, "epoch": 0.05502239283429303, "frac_reward_zero_std": 0.0, "grad_norm": 0.09743072219935084, "kl": 0.025928871938958764, "learning_rate": 9.92720475597734e-07, "loss": -0.0857, "num_tokens": 7242000.0, "reward": 0.7916666865348816, "reward_std": 0.8852130770683289, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000147819519043, "sampling/importance_sampling_ratio/min": 0.4826893210411072, "sampling/sampling_logp_difference/max": 0.9806022644042969, "sampling/sampling_logp_difference/mean": 0.006808350794017315, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5301.0, "completions/max_terminated_length": 5301.0, "completions/mean_length": 2842.166748046875, "completions/mean_terminated_length": 2842.166748046875, "completions/min_length": 1324.0, "completions/min_terminated_length": 1324.0, "entropy": 0.25885526463389397, "epoch": 0.05566218809980806, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06752903284069191, "kl": 0.0005368091660784557, "learning_rate": 9.925486143243532e-07, "loss": 0.0218, "num_tokens": 7328580.0, "reward": 0.2083333432674408, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5578653812408447, "sampling/importance_sampling_ratio/mean": 1.0000576972961426, "sampling/importance_sampling_ratio/min": 0.45132291316986084, "sampling/sampling_logp_difference/max": 0.7955722808837891, "sampling/sampling_logp_difference/mean": 0.0074488259851932526, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4126.0, "completions/mean_length": 2881.75, "completions/mean_terminated_length": 2650.86962890625, "completions/min_length": 1348.0, "completions/min_terminated_length": 1348.0, "entropy": 0.29125165939331055, "epoch": 0.0563019833653231, "frac_reward_zero_std": 0.0, "grad_norm": 0.14784273753555507, "kl": 0.0006449329375755042, "learning_rate": 9.923747631532967e-07, "loss": 0.1861, "num_tokens": 7418654.0, "reward": 1.125, "reward_std": 0.8180223703384399, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.806158185005188, "sampling/importance_sampling_ratio/mean": 1.000097393989563, "sampling/importance_sampling_ratio/min": 0.2986098527908325, "sampling/sampling_logp_difference/max": 1.2086174488067627, "sampling/sampling_logp_difference/mean": 0.008500674739480019, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7490.0, "completions/mean_length": 3045.08349609375, "completions/mean_terminated_length": 2821.304443359375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.32459670305252075, "epoch": 0.05694177863083813, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09034175516414802, "kl": 0.024205586392781697, "learning_rate": 9.921989227869236e-07, "loss": 0.0916, "num_tokens": 7508544.0, "reward": 0.9166666865348816, "reward_std": 0.34503278136253357, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.5137299299240112, "sampling/importance_sampling_ratio/mean": 1.0000594854354858, "sampling/importance_sampling_ratio/min": 0.5252759456634521, "sampling/sampling_logp_difference/max": 0.6438314914703369, "sampling/sampling_logp_difference/mean": 0.00899562332779169, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3335.0, "completions/max_terminated_length": 3335.0, "completions/mean_length": 1855.25, "completions/mean_terminated_length": 1855.25, "completions/min_length": 1054.0, "completions/min_terminated_length": 1054.0, "entropy": 0.18844327703118324, "epoch": 0.05758157389635317, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.036686739153080024, "kl": 0.0006588037649635226, "learning_rate": 9.920210939356294e-07, "loss": 0.0168, "num_tokens": 7573038.0, "reward": 1.9166667461395264, "reward_std": 0.2357022613286972, "rewards/accuracy_reward/mean": 0.9583333134651184, "rewards/accuracy_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.3634922504425049, "sampling/importance_sampling_ratio/mean": 0.999864399433136, "sampling/importance_sampling_ratio/min": 0.624430775642395, "sampling/sampling_logp_difference/max": 0.4709148406982422, "sampling/sampling_logp_difference/mean": 0.0058212000876665115, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4752.0, "completions/max_terminated_length": 4752.0, "completions/mean_length": 2695.375, "completions/mean_terminated_length": 2695.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5460686609148979, "epoch": 0.0582213691618682, "frac_reward_zero_std": 0.0, "grad_norm": 0.11585571160367933, "kl": 0.0016939938650466502, "learning_rate": 9.918412773178429e-07, "loss": -0.0493, "num_tokens": 7653495.0, "reward": 0.5833333730697632, "reward_std": 0.5443090200424194, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.6324069499969482, "sampling/importance_sampling_ratio/mean": 1.0002412796020508, "sampling/importance_sampling_ratio/min": 0.4800148606300354, "sampling/sampling_logp_difference/max": 0.7339382171630859, "sampling/sampling_logp_difference/mean": 0.01028500311076641, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4930.0, "completions/mean_length": 2503.791748046875, "completions/mean_terminated_length": 2256.478271484375, "completions/min_length": 825.0, "completions/min_terminated_length": 825.0, "entropy": 0.3159922584891319, "epoch": 0.05886116442738324, "frac_reward_zero_std": 0.0, "grad_norm": 0.10528643839415038, "kl": 0.0007275855605257675, "learning_rate": 9.916594736600242e-07, "loss": -0.0045, "num_tokens": 7728650.0, "reward": 0.75, "reward_std": 0.5443090200424194, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000230312347412, "sampling/importance_sampling_ratio/min": 0.3813911974430084, "sampling/sampling_logp_difference/max": 0.9639296531677246, "sampling/sampling_logp_difference/mean": 0.009298719465732574, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5416.0, "completions/mean_length": 2699.125, "completions/mean_terminated_length": 2460.304443359375, "completions/min_length": 1320.0, "completions/min_terminated_length": 1320.0, "entropy": 0.275405865162611, "epoch": 0.05950095969289827, "frac_reward_zero_std": 0.0, "grad_norm": 0.09399019861788217, "kl": 0.0006654523313045502, "learning_rate": 9.914756836966604e-07, "loss": 0.1854, "num_tokens": 7811709.0, "reward": 1.0416667461395264, "reward_std": 0.7714905738830566, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5752983093261719, "sampling/importance_sampling_ratio/mean": 1.0000163316726685, "sampling/importance_sampling_ratio/min": 0.5439392924308777, "sampling/sampling_logp_difference/max": 0.6089177131652832, "sampling/sampling_logp_difference/mean": 0.008419802412390709, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4780.0, "completions/max_terminated_length": 4780.0, "completions/mean_length": 2660.20849609375, "completions/mean_terminated_length": 2660.20849609375, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "entropy": 0.44148601591587067, "epoch": 0.060140754958413305, "frac_reward_zero_std": 0.0, "grad_norm": 0.2841018422942628, "kl": 0.00107448166818358, "learning_rate": 9.912899081702633e-07, "loss": 0.0087, "num_tokens": 7886098.0, "reward": 0.9583333730697632, "reward_std": 0.8243736028671265, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.7356716394424438, "sampling/importance_sampling_ratio/mean": 0.9997854232788086, "sampling/importance_sampling_ratio/min": 0.4372701048851013, "sampling/sampling_logp_difference/max": 0.8272042274475098, "sampling/sampling_logp_difference/mean": 0.011900268495082855, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3905.0, "completions/max_terminated_length": 3905.0, "completions/mean_length": 1981.5, "completions/mean_terminated_length": 1981.5, "completions/min_length": 1254.0, "completions/min_terminated_length": 1254.0, "entropy": 0.3126012533903122, "epoch": 0.060780550223928344, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09777517657869089, "kl": 0.000753736196202226, "learning_rate": 9.911021478313664e-07, "loss": -0.0328, "num_tokens": 7951798.0, "reward": 1.1666667461395264, "reward_std": 0.45069071650505066, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.385780692100525, "sampling/importance_sampling_ratio/mean": 0.9999410510063171, "sampling/importance_sampling_ratio/min": 0.5705700516700745, "sampling/sampling_logp_difference/max": 0.5611193180084229, "sampling/sampling_logp_difference/mean": 0.008859032765030861, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3853.0, "completions/max_terminated_length": 3853.0, "completions/mean_length": 2475.125, "completions/mean_terminated_length": 2475.125, "completions/min_length": 1333.0, "completions/min_terminated_length": 1333.0, "entropy": 0.34115058928728104, "epoch": 0.061420345489443376, "frac_reward_zero_std": 0.0, "grad_norm": 0.11933821956576103, "kl": 0.0007628337625646964, "learning_rate": 9.909124034385224e-07, "loss": -0.0663, "num_tokens": 8025305.0, "reward": 0.375, "reward_std": 0.660194993019104, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.6367162466049194, "sampling/importance_sampling_ratio/mean": 0.9999073147773743, "sampling/importance_sampling_ratio/min": 0.27867791056632996, "sampling/sampling_logp_difference/max": 1.2776985168457031, "sampling/sampling_logp_difference/mean": 0.009546327404677868, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3305.0, "completions/max_terminated_length": 3305.0, "completions/mean_length": 1762.291748046875, "completions/mean_terminated_length": 1762.291748046875, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "entropy": 0.26064635068178177, "epoch": 0.062060140754958415, "frac_reward_zero_std": 0.0, "grad_norm": 0.11375125908412936, "kl": 0.0007656551897525787, "learning_rate": 9.907206757582984e-07, "loss": 0.0056, "num_tokens": 8079256.0, "reward": 1.1666667461395264, "reward_std": 0.754700779914856, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.6926578283309937, "sampling/importance_sampling_ratio/mean": 0.9999468922615051, "sampling/importance_sampling_ratio/min": 0.6292046308517456, "sampling/sampling_logp_difference/max": 0.5262999534606934, "sampling/sampling_logp_difference/mean": 0.007373776286840439, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3727.0, "completions/max_terminated_length": 3727.0, "completions/mean_length": 2097.5, "completions/mean_terminated_length": 2097.5, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.37176523357629776, "epoch": 0.06269993602047345, "frac_reward_zero_std": 0.0, "grad_norm": 0.1039498738558115, "kl": 0.004007546798675321, "learning_rate": 9.905269655652757e-07, "loss": -0.0037, "num_tokens": 8142860.0, "reward": 1.375, "reward_std": 0.6285028457641602, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.3913205862045288, "sampling/importance_sampling_ratio/mean": 0.9998879432678223, "sampling/importance_sampling_ratio/min": 0.633051872253418, "sampling/sampling_logp_difference/max": 0.4572029113769531, "sampling/sampling_logp_difference/mean": 0.008336901664733887, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3805.0, "completions/max_terminated_length": 3805.0, "completions/mean_length": 1729.916748046875, "completions/mean_terminated_length": 1729.916748046875, "completions/min_length": 480.0, "completions/min_terminated_length": 480.0, "entropy": 0.2790737636387348, "epoch": 0.06333973128598848, "frac_reward_zero_std": 0.0, "grad_norm": 0.10755437517464145, "kl": 0.00095232596504502, "learning_rate": 9.90331273642043e-07, "loss": 0.098, "num_tokens": 8200290.0, "reward": 1.0833333730697632, "reward_std": 0.7385624647140503, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4418771266937256, "sampling/importance_sampling_ratio/mean": 1.0000568628311157, "sampling/importance_sampling_ratio/min": 0.6049904823303223, "sampling/sampling_logp_difference/max": 0.5025424957275391, "sampling/sampling_logp_difference/mean": 0.008825618773698807, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5417.0, "completions/max_terminated_length": 5417.0, "completions/mean_length": 2337.916748046875, "completions/mean_terminated_length": 2337.916748046875, "completions/min_length": 487.0, "completions/min_terminated_length": 487.0, "entropy": 0.3962782695889473, "epoch": 0.06397952655150352, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11272458913487528, "kl": 0.0009024297614814714, "learning_rate": 9.90133600779197e-07, "loss": -0.1214, "num_tokens": 8268680.0, "reward": 0.875, "reward_std": 0.3698274493217468, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.7755801677703857, "sampling/importance_sampling_ratio/mean": 0.999931812286377, "sampling/importance_sampling_ratio/min": 0.41345590353012085, "sampling/sampling_logp_difference/max": 0.883204460144043, "sampling/sampling_logp_difference/mean": 0.011491885408759117, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7580.0, "completions/max_terminated_length": 7580.0, "completions/mean_length": 2363.625, "completions/mean_terminated_length": 2363.625, "completions/min_length": 1108.0, "completions/min_terminated_length": 1108.0, "entropy": 0.26323822885751724, "epoch": 0.06461932181701856, "frac_reward_zero_std": 0.0, "grad_norm": 0.10169434170809571, "kl": 0.000861114080180414, "learning_rate": 9.899339477753359e-07, "loss": 0.0596, "num_tokens": 8346383.0, "reward": 0.8333333730697632, "reward_std": 0.7957234382629395, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000249147415161, "sampling/importance_sampling_ratio/min": 0.3817090392112732, "sampling/sampling_logp_difference/max": 0.9630966186523438, "sampling/sampling_logp_difference/mean": 0.008059553802013397, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 5617.0, "completions/mean_length": 3848.08349609375, "completions/mean_terminated_length": 3227.52392578125, "completions/min_length": 1523.0, "completions/min_terminated_length": 1523.0, "entropy": 0.42456384748220444, "epoch": 0.06525911708253358, "frac_reward_zero_std": 0.0, "grad_norm": 0.09311662825446881, "kl": 0.0009113708510994911, "learning_rate": 9.897323154370589e-07, "loss": 0.0222, "num_tokens": 8452689.0, "reward": 0.4166666865348816, "reward_std": 0.6582559943199158, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000585317611694, "sampling/importance_sampling_ratio/min": 0.44077423214912415, "sampling/sampling_logp_difference/max": 0.8511848449707031, "sampling/sampling_logp_difference/mean": 0.012198364362120628, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7371.0, "completions/mean_length": 3604.041748046875, "completions/mean_terminated_length": 3404.565185546875, "completions/min_length": 1025.0, "completions/min_terminated_length": 1025.0, "entropy": 0.4215092658996582, "epoch": 0.06589891234804862, "frac_reward_zero_std": 0.0, "grad_norm": 0.10148745507447068, "kl": 0.0008012275793589652, "learning_rate": 9.895287045789607e-07, "loss": 0.0226, "num_tokens": 8559290.0, "reward": 0.8333333730697632, "reward_std": 0.7709748148918152, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.3752916157245636, "sampling/sampling_logp_difference/max": 1.3373374938964844, "sampling/sampling_logp_difference/mean": 0.011722523719072342, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7706.0, "completions/max_terminated_length": 7706.0, "completions/mean_length": 3873.58349609375, "completions/mean_terminated_length": 3873.58349609375, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "entropy": 0.38763078302145004, "epoch": 0.06653870761356366, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06995468305624498, "kl": 0.0008780086500337347, "learning_rate": 9.8932311602363e-07, "loss": -0.0222, "num_tokens": 8670112.0, "reward": 0.5, "reward_std": 0.5311707258224487, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000258684158325, "sampling/importance_sampling_ratio/min": 0.4419771134853363, "sampling/sampling_logp_difference/max": 0.9970874786376953, "sampling/sampling_logp_difference/mean": 0.010420046746730804, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4451.0, "completions/max_terminated_length": 4451.0, "completions/mean_length": 2985.416748046875, "completions/mean_terminated_length": 2985.416748046875, "completions/min_length": 1440.0, "completions/min_terminated_length": 1440.0, "entropy": 0.4032706171274185, "epoch": 0.0671785028790787, "frac_reward_zero_std": 0.0, "grad_norm": 0.09159874511310477, "kl": 0.0009294050541939214, "learning_rate": 9.89115550601645e-07, "loss": 0.0375, "num_tokens": 8761546.0, "reward": 0.5833333730697632, "reward_std": 0.5970091223716736, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000045895576477, "sampling/importance_sampling_ratio/min": 0.5233864188194275, "sampling/sampling_logp_difference/max": 0.7262942790985107, "sampling/sampling_logp_difference/mean": 0.01129382848739624, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4175.0, "completions/max_terminated_length": 4175.0, "completions/mean_length": 2074.416748046875, "completions/mean_terminated_length": 2074.416748046875, "completions/min_length": 666.0, "completions/min_terminated_length": 666.0, "entropy": 0.2526097558438778, "epoch": 0.06781829814459372, "frac_reward_zero_std": 0.0, "grad_norm": 0.12629766875760456, "kl": 0.0009086456411750987, "learning_rate": 9.889060091515707e-07, "loss": -0.1519, "num_tokens": 8835468.0, "reward": 0.5416666865348816, "reward_std": 0.6628212928771973, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.4324772357940674, "sampling/importance_sampling_ratio/mean": 1.0000720024108887, "sampling/importance_sampling_ratio/min": 0.44312426447868347, "sampling/sampling_logp_difference/max": 0.8139050006866455, "sampling/sampling_logp_difference/mean": 0.007927043363451958, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6753.0, "completions/max_terminated_length": 6753.0, "completions/mean_length": 3481.20849609375, "completions/mean_terminated_length": 3481.20849609375, "completions/min_length": 1236.0, "completions/min_terminated_length": 1236.0, "entropy": 0.33701302111148834, "epoch": 0.06845809341010876, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09052795939509438, "kl": 0.0008564841700717807, "learning_rate": 9.886944925199549e-07, "loss": 0.0417, "num_tokens": 8935121.0, "reward": 0.6666666865348816, "reward_std": 0.5039526224136353, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.9053577184677124, "sampling/importance_sampling_ratio/mean": 0.9998910427093506, "sampling/importance_sampling_ratio/min": 0.3372038006782532, "sampling/sampling_logp_difference/max": 1.0870678424835205, "sampling/sampling_logp_difference/mean": 0.009490902535617352, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3563.0, "completions/max_terminated_length": 3563.0, "completions/mean_length": 2048.95849609375, "completions/mean_terminated_length": 2048.95849609375, "completions/min_length": 1233.0, "completions/min_terminated_length": 1233.0, "entropy": 0.25819555670022964, "epoch": 0.0690978886756238, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07476823498367897, "kl": 0.0009461005247430876, "learning_rate": 9.884810015613253e-07, "loss": -0.0081, "num_tokens": 9000776.0, "reward": 0.7083333730697632, "reward_std": 0.6258121728897095, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000121593475342, "sampling/importance_sampling_ratio/min": 0.6086333990097046, "sampling/sampling_logp_difference/max": 0.7612214088439941, "sampling/sampling_logp_difference/mean": 0.007934361696243286, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6749.0, "completions/max_terminated_length": 6749.0, "completions/mean_length": 2657.041748046875, "completions/mean_terminated_length": 2657.041748046875, "completions/min_length": 1153.0, "completions/min_terminated_length": 1153.0, "entropy": 0.30035800859332085, "epoch": 0.06973768394113884, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.04831018266869856, "kl": 0.0009174856240861118, "learning_rate": 9.88265537138186e-07, "loss": 0.0631, "num_tokens": 9078433.0, "reward": 0.375, "reward_std": 0.1178511306643486, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.5966262817382812, "sampling/importance_sampling_ratio/mean": 1.0000253915786743, "sampling/importance_sampling_ratio/min": 0.41211065649986267, "sampling/sampling_logp_difference/max": 0.8864634037017822, "sampling/sampling_logp_difference/mean": 0.009145185351371765, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5616.0, "completions/max_terminated_length": 5616.0, "completions/mean_length": 2924.875, "completions/mean_terminated_length": 2924.875, "completions/min_length": 1559.0, "completions/min_terminated_length": 1559.0, "entropy": 0.3366604894399643, "epoch": 0.07037747920665387, "frac_reward_zero_std": 0.0, "grad_norm": 0.08366411196705088, "kl": 0.0009730554884299636, "learning_rate": 9.88048100121014e-07, "loss": 0.1292, "num_tokens": 9170878.0, "reward": 0.8333333730697632, "reward_std": 0.7291311025619507, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.7978906631469727, "sampling/importance_sampling_ratio/mean": 1.000046968460083, "sampling/importance_sampling_ratio/min": 0.582000732421875, "sampling/sampling_logp_difference/max": 0.5866141319274902, "sampling/sampling_logp_difference/mean": 0.009497575461864471, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8171.0, "completions/max_terminated_length": 8171.0, "completions/mean_length": 3030.25, "completions/mean_terminated_length": 3030.25, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.6573808714747429, "epoch": 0.0710172744721689, "frac_reward_zero_std": 0.0, "grad_norm": 0.11194531943707728, "kl": 0.0014857540954835713, "learning_rate": 9.878286913882552e-07, "loss": 0.1144, "num_tokens": 9255908.0, "reward": 0.8333333730697632, "reward_std": 0.700171172618866, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.5442315340042114, "sampling/importance_sampling_ratio/mean": 1.0000063180923462, "sampling/importance_sampling_ratio/min": 0.5115975737571716, "sampling/sampling_logp_difference/max": 0.6702170372009277, "sampling/sampling_logp_difference/mean": 0.012853170745074749, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5931.0, "completions/max_terminated_length": 5931.0, "completions/mean_length": 3121.95849609375, "completions/mean_terminated_length": 3121.95849609375, "completions/min_length": 1780.0, "completions/min_terminated_length": 1780.0, "entropy": 0.44486841559410095, "epoch": 0.07165706973768395, "frac_reward_zero_std": 0.0, "grad_norm": 0.11240665184790534, "kl": 0.0011569808411877602, "learning_rate": 9.876073118263215e-07, "loss": -0.0178, "num_tokens": 9345579.0, "reward": 0.4583333432674408, "reward_std": 0.5787960290908813, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000015497207642, "sampling/importance_sampling_ratio/min": 0.3507997393608093, "sampling/sampling_logp_difference/max": 1.0475397109985352, "sampling/sampling_logp_difference/mean": 0.011629275977611542, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3864.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 2268.45849609375, "completions/mean_terminated_length": 2268.45849609375, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "entropy": 0.2556172050535679, "epoch": 0.07229686500319897, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09798000193742777, "kl": 0.0009766742441570386, "learning_rate": 9.873839623295868e-07, "loss": -0.005, "num_tokens": 9419862.0, "reward": 0.9166666865348816, "reward_std": 0.38613972067832947, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.755584955215454, "sampling/importance_sampling_ratio/mean": 0.9998075366020203, "sampling/importance_sampling_ratio/min": 0.5049359798431396, "sampling/sampling_logp_difference/max": 0.6833236217498779, "sampling/sampling_logp_difference/mean": 0.007557287812232971, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4676.0, "completions/max_terminated_length": 4676.0, "completions/mean_length": 1950.75, "completions/mean_terminated_length": 1950.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.39119984954595566, "epoch": 0.07293666026871401, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10308450313422642, "kl": 0.010096332829562016, "learning_rate": 9.87158643800384e-07, "loss": -0.0497, "num_tokens": 9480728.0, "reward": 1.375, "reward_std": 0.6008435487747192, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5134928226470947, "sampling/importance_sampling_ratio/mean": 1.0000859498977661, "sampling/importance_sampling_ratio/min": 0.7568961977958679, "sampling/sampling_logp_difference/max": 0.41442012786865234, "sampling/sampling_logp_difference/mean": 0.00882451981306076, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3984.0, "completions/max_terminated_length": 3984.0, "completions/mean_length": 2484.291748046875, "completions/mean_terminated_length": 2484.291748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.39162763208150864, "epoch": 0.07357645553422905, "frac_reward_zero_std": 0.0, "grad_norm": 0.09720359687102405, "kl": 0.007065296813379973, "learning_rate": 9.86931357149e-07, "loss": -0.1137, "num_tokens": 9563951.0, "reward": 0.5, "reward_std": 0.8079428672790527, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.4018052816390991, "sampling/importance_sampling_ratio/mean": 0.9998170733451843, "sampling/importance_sampling_ratio/min": 0.6531490087509155, "sampling/sampling_logp_difference/max": 0.4259500503540039, "sampling/sampling_logp_difference/mean": 0.008480212651193142, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7755.0, "completions/mean_length": 4240.375, "completions/mean_terminated_length": 3675.857177734375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.5785017311573029, "epoch": 0.07421625079974409, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0662011362048105, "kl": 0.005218207603320479, "learning_rate": 9.86702103293674e-07, "loss": 0.0418, "num_tokens": 9683168.0, "reward": 0.375, "reward_std": 0.21362332999706268, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999780654907227, "sampling/importance_sampling_ratio/min": 0.5563015341758728, "sampling/sampling_logp_difference/max": 1.0558280944824219, "sampling/sampling_logp_difference/mean": 0.013546700589358807, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5154.0, "completions/mean_length": 3155.625, "completions/mean_terminated_length": 2936.65234375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "entropy": 0.40898968279361725, "epoch": 0.07485604606525911, "frac_reward_zero_std": 0.0, "grad_norm": 0.08801905393425413, "kl": 0.0011938214884139597, "learning_rate": 9.864708831605919e-07, "loss": 0.0518, "num_tokens": 9773919.0, "reward": 1.1666667461395264, "reward_std": 0.8893417716026306, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.589686632156372, "sampling/importance_sampling_ratio/mean": 1.00009024143219, "sampling/importance_sampling_ratio/min": 0.4031662940979004, "sampling/sampling_logp_difference/max": 0.908406138420105, "sampling/sampling_logp_difference/mean": 0.01078998576849699, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5308.0, "completions/max_terminated_length": 5308.0, "completions/mean_length": 2458.33349609375, "completions/mean_terminated_length": 2458.33349609375, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "entropy": 0.430414117872715, "epoch": 0.07549584133077415, "frac_reward_zero_std": 0.0, "grad_norm": 0.1225820167231431, "kl": 0.0017254001868423074, "learning_rate": 9.862376976838834e-07, "loss": -0.2044, "num_tokens": 9847487.0, "reward": 0.625, "reward_std": 0.5794824361801147, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5210131406784058, "sampling/importance_sampling_ratio/mean": 0.9998309016227722, "sampling/importance_sampling_ratio/min": 0.5816840529441833, "sampling/sampling_logp_difference/max": 0.5418277978897095, "sampling/sampling_logp_difference/mean": 0.011245353147387505, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6043.0, "completions/max_terminated_length": 6043.0, "completions/mean_length": 2599.08349609375, "completions/mean_terminated_length": 2599.08349609375, "completions/min_length": 21.0, "completions/min_terminated_length": 21.0, "entropy": 0.45592279732227325, "epoch": 0.07613563659628919, "frac_reward_zero_std": 0.0, "grad_norm": 0.10597826861134031, "kl": 0.0013507178809959441, "learning_rate": 9.86002547805619e-07, "loss": 0.0377, "num_tokens": 9923249.0, "reward": 1.0, "reward_std": 0.6741900444030762, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.6321262121200562, "sampling/importance_sampling_ratio/mean": 0.9999706745147705, "sampling/importance_sampling_ratio/min": 0.3205128014087677, "sampling/sampling_logp_difference/max": 1.1378331184387207, "sampling/sampling_logp_difference/mean": 0.010364701971411705, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5145.0, "completions/max_terminated_length": 5145.0, "completions/mean_length": 2682.95849609375, "completions/mean_terminated_length": 2682.95849609375, "completions/min_length": 1469.0, "completions/min_terminated_length": 1469.0, "entropy": 0.3290299251675606, "epoch": 0.07677543186180422, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08261193842376828, "kl": 0.0009449862700421363, "learning_rate": 9.857654344758044e-07, "loss": -0.0037, "num_tokens": 10010008.0, "reward": 0.5, "reward_std": 0.4497717618942261, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.5245401859283447, "sampling/importance_sampling_ratio/mean": 0.9999576210975647, "sampling/importance_sampling_ratio/min": 0.33151915669441223, "sampling/sampling_logp_difference/max": 1.104069709777832, "sampling/sampling_logp_difference/mean": 0.009344580583274364, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7677.0, "completions/max_terminated_length": 7677.0, "completions/mean_length": 3268.291748046875, "completions/mean_terminated_length": 3268.291748046875, "completions/min_length": 12.0, "completions/min_terminated_length": 12.0, "entropy": 0.44132353365421295, "epoch": 0.07741522712731926, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0922838846015718, "kl": 0.0015768093435326591, "learning_rate": 9.85526358652378e-07, "loss": -0.1375, "num_tokens": 10104983.0, "reward": 0.4583333432674408, "reward_std": 0.2721545100212097, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.9969881772994995, "sampling/importance_sampling_ratio/mean": 1.0000234842300415, "sampling/importance_sampling_ratio/min": 0.2884334325790405, "sampling/sampling_logp_difference/max": 1.243290901184082, "sampling/sampling_logp_difference/mean": 0.009998096153140068, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7220.0, "completions/mean_length": 4185.1669921875, "completions/mean_terminated_length": 4010.95654296875, "completions/min_length": 2396.0, "completions/min_terminated_length": 2396.0, "entropy": 0.26573725417256355, "epoch": 0.0780550223928343, "frac_reward_zero_std": 0.0, "grad_norm": 0.1444067793721764, "kl": 0.0008259995956905186, "learning_rate": 9.852853213012072e-07, "loss": 0.0854, "num_tokens": 10230827.0, "reward": 1.1666667461395264, "reward_std": 0.6321948766708374, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.8678396940231323, "sampling/importance_sampling_ratio/mean": 1.0000606775283813, "sampling/importance_sampling_ratio/min": 0.5465539693832397, "sampling/sampling_logp_difference/max": 0.6247825622558594, "sampling/sampling_logp_difference/mean": 0.007872618734836578, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6556.0, "completions/max_terminated_length": 6556.0, "completions/mean_length": 2847.58349609375, "completions/mean_terminated_length": 2847.58349609375, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "entropy": 0.27319880574941635, "epoch": 0.07869481765834933, "frac_reward_zero_std": 0.0, "grad_norm": 0.09334944177008621, "kl": 0.0010344568581786007, "learning_rate": 9.85042323396083e-07, "loss": -0.0288, "num_tokens": 10316185.0, "reward": 0.9583333730697632, "reward_std": 0.8315574526786804, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5147298574447632, "sampling/importance_sampling_ratio/mean": 1.0001040697097778, "sampling/importance_sampling_ratio/min": 0.5399750471115112, "sampling/sampling_logp_difference/max": 0.6162323951721191, "sampling/sampling_logp_difference/mean": 0.007726913318037987, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7951.0, "completions/max_terminated_length": 7951.0, "completions/mean_length": 2789.375, "completions/mean_terminated_length": 2789.375, "completions/min_length": 1132.0, "completions/min_terminated_length": 1132.0, "entropy": 0.3500289171934128, "epoch": 0.07933461292386436, "frac_reward_zero_std": 0.0, "grad_norm": 0.11486659039454267, "kl": 0.001127483876189217, "learning_rate": 9.84797365918718e-07, "loss": 0.0692, "num_tokens": 10400554.0, "reward": 0.75, "reward_std": 0.5970091223716736, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.9554049968719482, "sampling/importance_sampling_ratio/mean": 0.9999030232429504, "sampling/importance_sampling_ratio/min": 0.44586578011512756, "sampling/sampling_logp_difference/max": 0.8077373504638672, "sampling/sampling_logp_difference/mean": 0.00980476662516594, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3568.0, "completions/max_terminated_length": 3568.0, "completions/mean_length": 2287.875, "completions/mean_terminated_length": 2287.875, "completions/min_length": 1253.0, "completions/min_terminated_length": 1253.0, "entropy": 0.19350248202681541, "epoch": 0.0799744081893794, "frac_reward_zero_std": 0.0, "grad_norm": 0.08783354705825437, "kl": 0.0008390450821025297, "learning_rate": 9.845504498587406e-07, "loss": -0.1138, "num_tokens": 10476431.0, "reward": 0.875, "reward_std": 0.7801154851913452, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.7639849185943604, "sampling/importance_sampling_ratio/mean": 0.9999964237213135, "sampling/importance_sampling_ratio/min": 0.41163498163223267, "sampling/sampling_logp_difference/max": 0.8876183032989502, "sampling/sampling_logp_difference/mean": 0.005932595580816269, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3991.0, "completions/max_terminated_length": 3991.0, "completions/mean_length": 2148.041748046875, "completions/mean_terminated_length": 2148.041748046875, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "entropy": 0.28669821098446846, "epoch": 0.08061420345489444, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06551468018255376, "kl": 0.0009950985840987414, "learning_rate": 9.843015762136925e-07, "loss": -0.0317, "num_tokens": 10548088.0, "reward": 1.0833333730697632, "reward_std": 0.4232262969017029, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.5517724752426147, "sampling/importance_sampling_ratio/mean": 1.000080943107605, "sampling/importance_sampling_ratio/min": 0.5580407977104187, "sampling/sampling_logp_difference/max": 0.5833231806755066, "sampling/sampling_logp_difference/mean": 0.008220748044550419, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2468.0, "completions/max_terminated_length": 2468.0, "completions/mean_length": 1881.375, "completions/mean_terminated_length": 1881.375, "completions/min_length": 1439.0, "completions/min_terminated_length": 1439.0, "entropy": 0.2269131951034069, "epoch": 0.08125399872040946, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.059625958471333146, "kl": 0.0008961153216660023, "learning_rate": 9.840507459890242e-07, "loss": -0.0021, "num_tokens": 10617561.0, "reward": 1.9583333730697632, "reward_std": 0.1178511306643486, "rewards/accuracy_reward/mean": 0.9583333134651184, "rewards/accuracy_reward/std": 0.20412415266036987, "rewards/code_reward/mean": 1.0, "rewards/code_reward/std": 0.0, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001695156097412, "sampling/importance_sampling_ratio/min": 0.24454481899738312, "sampling/sampling_logp_difference/max": 1.437699556350708, "sampling/sampling_logp_difference/mean": 0.00648651085793972, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5733.0, "completions/max_terminated_length": 5733.0, "completions/mean_length": 2342.375, "completions/mean_terminated_length": 2342.375, "completions/min_length": 791.0, "completions/min_terminated_length": 791.0, "entropy": 0.24835346266627312, "epoch": 0.0818937939859245, "frac_reward_zero_std": 0.0, "grad_norm": 0.10874063731808915, "kl": 0.0013624947459902614, "learning_rate": 9.837979601980901e-07, "loss": -0.0979, "num_tokens": 10696210.0, "reward": 1.2916667461395264, "reward_std": 0.6618844270706177, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5897761583328247, "sampling/importance_sampling_ratio/mean": 1.000157356262207, "sampling/importance_sampling_ratio/min": 0.5724811553955078, "sampling/sampling_logp_difference/max": 0.5577754974365234, "sampling/sampling_logp_difference/mean": 0.007309045176953077, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6130.0, "completions/mean_length": 3639.875, "completions/mean_terminated_length": 3441.95654296875, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "entropy": 0.3974904865026474, "epoch": 0.08253358925143954, "frac_reward_zero_std": 0.0, "grad_norm": 0.08937215667174718, "kl": 0.0012234278547111899, "learning_rate": 9.835432198621457e-07, "loss": 0.1528, "num_tokens": 10798399.0, "reward": 0.875, "reward_std": 0.8565260171890259, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.8687667846679688, "sampling/importance_sampling_ratio/mean": 1.0000141859054565, "sampling/importance_sampling_ratio/min": 0.0993773341178894, "sampling/sampling_logp_difference/max": 2.308831214904785, "sampling/sampling_logp_difference/mean": 0.011553189717233181, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6284.0, "completions/max_terminated_length": 6284.0, "completions/mean_length": 2880.666748046875, "completions/mean_terminated_length": 2880.666748046875, "completions/min_length": 1200.0, "completions/min_terminated_length": 1200.0, "entropy": 0.36987627297639847, "epoch": 0.08317338451695458, "frac_reward_zero_std": 0.0, "grad_norm": 0.09114906194264796, "kl": 0.001260495831957087, "learning_rate": 9.832865260103423e-07, "loss": 0.1149, "num_tokens": 10887159.0, "reward": 1.0, "reward_std": 0.7410582304000854, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9997860789299011, "sampling/importance_sampling_ratio/min": 0.10698701441287994, "sampling/sampling_logp_difference/max": 2.2350478172302246, "sampling/sampling_logp_difference/mean": 0.010799629613757133, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8020.0, "completions/max_terminated_length": 8020.0, "completions/mean_length": 3733.166748046875, "completions/mean_terminated_length": 3733.166748046875, "completions/min_length": 1502.0, "completions/min_terminated_length": 1502.0, "entropy": 0.416974276304245, "epoch": 0.0838131797824696, "frac_reward_zero_std": 0.0, "grad_norm": 0.09195057223930331, "kl": 0.0013496605097316206, "learning_rate": 9.830278796797238e-07, "loss": 0.0292, "num_tokens": 10994267.0, "reward": 0.875, "reward_std": 0.6354264616966248, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6515499353408813, "sampling/importance_sampling_ratio/mean": 1.0000077486038208, "sampling/importance_sampling_ratio/min": 0.3649807572364807, "sampling/sampling_logp_difference/max": 1.0079107284545898, "sampling/sampling_logp_difference/mean": 0.011376752518117428, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6840.0, "completions/max_terminated_length": 6840.0, "completions/mean_length": 2806.70849609375, "completions/mean_terminated_length": 2806.70849609375, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "entropy": 0.3921313211321831, "epoch": 0.08445297504798464, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.14351610692116556, "kl": 0.0013573340838775039, "learning_rate": 9.827672819152222e-07, "loss": 0.011, "num_tokens": 11073188.0, "reward": 1.4166667461395264, "reward_std": 0.36585909128189087, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.668697714805603, "sampling/importance_sampling_ratio/mean": 1.000179648399353, "sampling/importance_sampling_ratio/min": 0.287142276763916, "sampling/sampling_logp_difference/max": 1.2477774620056152, "sampling/sampling_logp_difference/mean": 0.011767554096877575, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5653.0, "completions/max_terminated_length": 5653.0, "completions/mean_length": 2294.791748046875, "completions/mean_terminated_length": 2294.791748046875, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "entropy": 0.33717045933008194, "epoch": 0.08509277031349968, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09299059272755321, "kl": 0.0018217506876681, "learning_rate": 9.825047337696531e-07, "loss": -0.013, "num_tokens": 11138863.0, "reward": 0.5, "reward_std": 0.48678088188171387, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999903678894043, "sampling/importance_sampling_ratio/min": 0.3103809952735901, "sampling/sampling_logp_difference/max": 2.3379664421081543, "sampling/sampling_logp_difference/mean": 0.010032487101852894, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3569.0, "completions/max_terminated_length": 3569.0, "completions/mean_length": 1920.4583740234375, "completions/mean_terminated_length": 1920.4583740234375, "completions/min_length": 1128.0, "completions/min_terminated_length": 1128.0, "entropy": 0.30662602186203003, "epoch": 0.08573256557901472, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08498301000919478, "kl": 0.0011781325156334788, "learning_rate": 9.822402363037117e-07, "loss": -0.0569, "num_tokens": 11199642.0, "reward": 1.1666667461395264, "reward_std": 0.39000558853149414, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.7844425439834595, "sampling/importance_sampling_ratio/mean": 0.9999997019767761, "sampling/importance_sampling_ratio/min": 0.6419183015823364, "sampling/sampling_logp_difference/max": 0.5791060924530029, "sampling/sampling_logp_difference/mean": 0.00867033563554287, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4686.0, "completions/max_terminated_length": 4686.0, "completions/mean_length": 2812.25, "completions/mean_terminated_length": 2812.25, "completions/min_length": 1352.0, "completions/min_terminated_length": 1352.0, "entropy": 0.2999224364757538, "epoch": 0.08637236084452975, "frac_reward_zero_std": 0.0, "grad_norm": 0.11333680702207799, "kl": 0.0010092425654875115, "learning_rate": 9.819737905859684e-07, "loss": -0.0581, "num_tokens": 11283352.0, "reward": 1.4583333730697632, "reward_std": 0.7923169136047363, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.4733120203018188, "sampling/importance_sampling_ratio/mean": 0.999960720539093, "sampling/importance_sampling_ratio/min": 0.48431840538978577, "sampling/sampling_logp_difference/max": 0.7250127792358398, "sampling/sampling_logp_difference/mean": 0.00833617802709341, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5695.0, "completions/max_terminated_length": 5695.0, "completions/mean_length": 2631.25, "completions/mean_terminated_length": 2631.25, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "entropy": 0.42088671773672104, "epoch": 0.08701215611004479, "frac_reward_zero_std": 0.0, "grad_norm": 0.10342280716178136, "kl": 0.0014085829607211053, "learning_rate": 9.817053976928643e-07, "loss": -0.0347, "num_tokens": 11361990.0, "reward": 0.75, "reward_std": 0.7655571699142456, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.4222561120986938, "sampling/importance_sampling_ratio/mean": 1.0000284910202026, "sampling/importance_sampling_ratio/min": 0.28562813997268677, "sampling/sampling_logp_difference/max": 1.253064513206482, "sampling/sampling_logp_difference/mean": 0.011888442561030388, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5113.0, "completions/max_terminated_length": 5113.0, "completions/mean_length": 2829.83349609375, "completions/mean_terminated_length": 2829.83349609375, "completions/min_length": 1563.0, "completions/min_terminated_length": 1563.0, "entropy": 0.3028028607368469, "epoch": 0.08765195137555983, "frac_reward_zero_std": 0.0, "grad_norm": 0.10178223787613755, "kl": 0.001152879005530849, "learning_rate": 9.814350587087082e-07, "loss": -0.0216, "num_tokens": 11455570.0, "reward": 0.5416666865348816, "reward_std": 0.6592972278594971, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.8931385278701782, "sampling/importance_sampling_ratio/mean": 1.0001392364501953, "sampling/importance_sampling_ratio/min": 0.5387505292892456, "sampling/sampling_logp_difference/max": 0.6382360458374023, "sampling/sampling_logp_difference/mean": 0.008837200701236725, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7743.0, "completions/mean_length": 3310.45849609375, "completions/mean_terminated_length": 3098.217529296875, "completions/min_length": 51.0, "completions/min_terminated_length": 51.0, "entropy": 0.5407263934612274, "epoch": 0.08829174664107485, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08482424858872144, "kl": 0.0016552802408114076, "learning_rate": 9.811627747256693e-07, "loss": -0.0921, "num_tokens": 11545485.0, "reward": 0.7083333730697632, "reward_std": 0.4655996263027191, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998679161071777, "sampling/importance_sampling_ratio/min": 0.5615704655647278, "sampling/sampling_logp_difference/max": 0.700777530670166, "sampling/sampling_logp_difference/mean": 0.011737285181879997, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 3120.0, "completions/mean_length": 2330.25, "completions/mean_terminated_length": 2075.391357421875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7217262759804726, "epoch": 0.08893154190658989, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06467700829371066, "kl": 0.005849264707649127, "learning_rate": 9.808885468437763e-07, "loss": -0.0029, "num_tokens": 11647867.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.4690499305725098, "sampling/importance_sampling_ratio/mean": 0.9997474551200867, "sampling/importance_sampling_ratio/min": 0.6456506848335266, "sampling/sampling_logp_difference/max": 0.4374966621398926, "sampling/sampling_logp_difference/mean": 0.008662140928208828, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7779.0, "completions/max_terminated_length": 7779.0, "completions/mean_length": 3104.875, "completions/mean_terminated_length": 3104.875, "completions/min_length": 1424.0, "completions/min_terminated_length": 1424.0, "entropy": 0.3028611093759537, "epoch": 0.08957133717210493, "frac_reward_zero_std": 0.0, "grad_norm": 0.09298504076176028, "kl": 0.0014160479622660205, "learning_rate": 9.806123761709103e-07, "loss": 0.0724, "num_tokens": 11741000.0, "reward": 1.2916667461395264, "reward_std": 0.8589093685150146, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999585151672363, "sampling/importance_sampling_ratio/min": 0.3920222520828247, "sampling/sampling_logp_difference/max": 0.936436653137207, "sampling/sampling_logp_difference/mean": 0.008893564343452454, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5855.0, "completions/max_terminated_length": 5855.0, "completions/mean_length": 3254.875, "completions/mean_terminated_length": 3254.875, "completions/min_length": 1472.0, "completions/min_terminated_length": 1472.0, "entropy": 0.29711710289120674, "epoch": 0.09021113243761997, "frac_reward_zero_std": 0.0, "grad_norm": 0.09510999336034509, "kl": 0.0010234743531327695, "learning_rate": 9.803342638228013e-07, "loss": -0.1177, "num_tokens": 11830605.0, "reward": 0.9583333730697632, "reward_std": 0.6354264616966248, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.5695785284042358, "sampling/importance_sampling_ratio/mean": 1.0001976490020752, "sampling/importance_sampling_ratio/min": 0.6204731464385986, "sampling/sampling_logp_difference/max": 0.47727298736572266, "sampling/sampling_logp_difference/mean": 0.008494673296809196, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5106.0, "completions/max_terminated_length": 5106.0, "completions/mean_length": 2644.75, "completions/mean_terminated_length": 2644.75, "completions/min_length": 1423.0, "completions/min_terminated_length": 1423.0, "entropy": 0.3657931834459305, "epoch": 0.09085092770313499, "frac_reward_zero_std": 0.0, "grad_norm": 0.10913012886164138, "kl": 0.0016896942688617855, "learning_rate": 9.800542109230246e-07, "loss": -0.1111, "num_tokens": 11908959.0, "reward": 0.6666666865348816, "reward_std": 0.7513124346733093, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.7716253995895386, "sampling/importance_sampling_ratio/mean": 0.9997608661651611, "sampling/importance_sampling_ratio/min": 0.696897566318512, "sampling/sampling_logp_difference/max": 0.5718975067138672, "sampling/sampling_logp_difference/mean": 0.01045970618724823, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7228.0, "completions/max_terminated_length": 7228.0, "completions/mean_length": 3230.166748046875, "completions/mean_terminated_length": 3230.166748046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.34473128616809845, "epoch": 0.09149072296865003, "frac_reward_zero_std": 0.0, "grad_norm": 0.09338920372313754, "kl": 0.014672585908556357, "learning_rate": 9.797722186029938e-07, "loss": -0.0365, "num_tokens": 12008667.0, "reward": 0.5, "reward_std": 0.7568016052246094, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000739097595215, "sampling/importance_sampling_ratio/min": 0.5188302397727966, "sampling/sampling_logp_difference/max": 1.3602993488311768, "sampling/sampling_logp_difference/mean": 0.00938168540596962, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6622.0, "completions/max_terminated_length": 6622.0, "completions/mean_length": 3226.375, "completions/mean_terminated_length": 3226.375, "completions/min_length": 1325.0, "completions/min_terminated_length": 1325.0, "entropy": 0.3979756236076355, "epoch": 0.09213051823416507, "frac_reward_zero_std": 0.0, "grad_norm": 0.11475972702158212, "kl": 0.0011757590982597321, "learning_rate": 9.794882880019592e-07, "loss": 0.0652, "num_tokens": 12097260.0, "reward": 0.9166666865348816, "reward_std": 0.6582559943199158, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.4518834352493286, "sampling/importance_sampling_ratio/mean": 0.9999675154685974, "sampling/importance_sampling_ratio/min": 0.37298110127449036, "sampling/sampling_logp_difference/max": 0.9862275123596191, "sampling/sampling_logp_difference/mean": 0.010617366060614586, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5466.0, "completions/max_terminated_length": 5466.0, "completions/mean_length": 2541.041748046875, "completions/mean_terminated_length": 2541.041748046875, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "entropy": 0.2949911393225193, "epoch": 0.0927703134996801, "frac_reward_zero_std": 0.0, "grad_norm": 0.09013526685551662, "kl": 0.0011382299417164177, "learning_rate": 9.792024202670008e-07, "loss": 0.0502, "num_tokens": 12173893.0, "reward": 0.7916666865348816, "reward_std": 0.7168253660202026, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.912355899810791, "sampling/importance_sampling_ratio/mean": 0.9999849200248718, "sampling/importance_sampling_ratio/min": 0.5412169098854065, "sampling/sampling_logp_difference/max": 0.6483359336853027, "sampling/sampling_logp_difference/mean": 0.008545596152544022, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6594.0, "completions/max_terminated_length": 6594.0, "completions/mean_length": 2385.041748046875, "completions/mean_terminated_length": 2385.041748046875, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "entropy": 0.36539843678474426, "epoch": 0.09341010876519514, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06883763701215209, "kl": 0.0011902795231435448, "learning_rate": 9.789146165530254e-07, "loss": 0.0585, "num_tokens": 12249550.0, "reward": 0.9166666865348816, "reward_std": 0.34503278136253357, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.3158247470855713, "sampling/importance_sampling_ratio/mean": 0.9999129772186279, "sampling/importance_sampling_ratio/min": 0.5966050028800964, "sampling/sampling_logp_difference/max": 0.5164999961853027, "sampling/sampling_logp_difference/mean": 0.010143565945327282, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4228.0, "completions/max_terminated_length": 4228.0, "completions/mean_length": 2413.625, "completions/mean_terminated_length": 2413.625, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "entropy": 0.20008930191397667, "epoch": 0.09404990403071017, "frac_reward_zero_std": 0.0, "grad_norm": 0.08625993652735542, "kl": 0.0011882565886480734, "learning_rate": 9.786248780227603e-07, "loss": -0.0547, "num_tokens": 12330733.0, "reward": 1.0833333730697632, "reward_std": 0.8443689346313477, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.6188688278198242, "sampling/importance_sampling_ratio/mean": 1.0000890493392944, "sampling/importance_sampling_ratio/min": 0.4758911728858948, "sampling/sampling_logp_difference/max": 0.7425661087036133, "sampling/sampling_logp_difference/mean": 0.006483612582087517, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7654.0, "completions/mean_length": 3017.041748046875, "completions/mean_terminated_length": 2546.591064453125, "completions/min_length": 832.0, "completions/min_terminated_length": 832.0, "entropy": 0.3272504508495331, "epoch": 0.09468969929622521, "frac_reward_zero_std": 0.0, "grad_norm": 0.10307399042513535, "kl": 0.001347172277746722, "learning_rate": 9.783332058467501e-07, "loss": 0.0634, "num_tokens": 12418822.0, "reward": 0.8333333730697632, "reward_std": 0.7568015456199646, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000022649765015, "sampling/importance_sampling_ratio/min": 0.3574959933757782, "sampling/sampling_logp_difference/max": 1.0286310911178589, "sampling/sampling_logp_difference/mean": 0.009331129491329193, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 8035.0, "completions/mean_length": 4461.375, "completions/mean_terminated_length": 4122.2275390625, "completions/min_length": 1663.0, "completions/min_terminated_length": 1663.0, "entropy": 0.3737086355686188, "epoch": 0.09532949456174024, "frac_reward_zero_std": 0.0, "grad_norm": 0.09562775337800886, "kl": 0.0012114033452235162, "learning_rate": 9.780396012033512e-07, "loss": 0.0174, "num_tokens": 12544175.0, "reward": 0.7083333730697632, "reward_std": 0.7583522796630859, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999892115592957, "sampling/importance_sampling_ratio/min": 0.41602322459220886, "sampling/sampling_logp_difference/max": 1.20128333568573, "sampling/sampling_logp_difference/mean": 0.010904356837272644, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7846.0, "completions/max_terminated_length": 7846.0, "completions/mean_length": 3179.58349609375, "completions/mean_terminated_length": 3179.58349609375, "completions/min_length": 1532.0, "completions/min_terminated_length": 1532.0, "entropy": 0.40681789070367813, "epoch": 0.09596928982725528, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.7937030148582486, "kl": 0.002635878830915317, "learning_rate": 9.77744065278727e-07, "loss": -0.023, "num_tokens": 12642461.0, "reward": 0.4166666865348816, "reward_std": 0.5533831119537354, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000349283218384, "sampling/importance_sampling_ratio/min": 0.04645952582359314, "sampling/sampling_logp_difference/max": 3.069173812866211, "sampling/sampling_logp_difference/mean": 0.011541067622601986, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2944.0, "completions/max_terminated_length": 2944.0, "completions/mean_length": 1732.7083740234375, "completions/mean_terminated_length": 1732.7083740234375, "completions/min_length": 1013.0, "completions/min_terminated_length": 1013.0, "entropy": 0.19381669908761978, "epoch": 0.09660908509277032, "frac_reward_zero_std": 0.0, "grad_norm": 0.1061493306087895, "kl": 0.0014254739508032799, "learning_rate": 9.774465992668437e-07, "loss": 0.0805, "num_tokens": 12708662.0, "reward": 1.2083333730697632, "reward_std": 0.589255690574646, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4650764465332031, "sampling/importance_sampling_ratio/mean": 0.9999523162841797, "sampling/importance_sampling_ratio/min": 0.5834369659423828, "sampling/sampling_logp_difference/max": 0.5388188362121582, "sampling/sampling_logp_difference/mean": 0.005971837323158979, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7374.0, "completions/max_terminated_length": 7374.0, "completions/mean_length": 2663.08349609375, "completions/mean_terminated_length": 2663.08349609375, "completions/min_length": 1223.0, "completions/min_terminated_length": 1223.0, "entropy": 0.35547055304050446, "epoch": 0.09724888035828536, "frac_reward_zero_std": 0.0, "grad_norm": 0.10559888639297434, "kl": 0.0011785085080191493, "learning_rate": 9.771472043694644e-07, "loss": 0.0584, "num_tokens": 12795008.0, "reward": 0.9166666865348816, "reward_std": 0.5989742279052734, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.3643404245376587, "sampling/importance_sampling_ratio/mean": 0.9999713897705078, "sampling/importance_sampling_ratio/min": 0.6158185601234436, "sampling/sampling_logp_difference/max": 0.48480284214019775, "sampling/sampling_logp_difference/mean": 0.009853608906269073, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 6130.0, "completions/mean_length": 3665.75, "completions/mean_terminated_length": 3254.272705078125, "completions/min_length": 1485.0, "completions/min_terminated_length": 1485.0, "entropy": 0.42592156678438187, "epoch": 0.09788867562380038, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07252532858255552, "kl": 0.0012395988160278648, "learning_rate": 9.768458817961453e-07, "loss": 0.0257, "num_tokens": 12894266.0, "reward": 0.625, "reward_std": 0.6504079103469849, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.712336778640747, "sampling/importance_sampling_ratio/mean": 0.9998922348022461, "sampling/importance_sampling_ratio/min": 0.5353351831436157, "sampling/sampling_logp_difference/max": 0.6248621940612793, "sampling/sampling_logp_difference/mean": 0.011649003252387047, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7059.0, "completions/mean_length": 3568.791748046875, "completions/mean_terminated_length": 3367.78271484375, "completions/min_length": 1443.0, "completions/min_terminated_length": 1443.0, "entropy": 0.38343729078769684, "epoch": 0.09852847088931542, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.03014254933055635, "kl": 0.0010382912441855296, "learning_rate": 9.765426327642306e-07, "loss": 0.0079, "num_tokens": 12997197.0, "reward": 0.25, "reward_std": 0.15430335700511932, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.6831371784210205, "sampling/importance_sampling_ratio/mean": 0.9999809861183167, "sampling/importance_sampling_ratio/min": 0.503947377204895, "sampling/sampling_logp_difference/max": 0.6852834224700928, "sampling/sampling_logp_difference/mean": 0.010347530245780945, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7508.0, "completions/max_terminated_length": 7508.0, "completions/mean_length": 3437.125, "completions/mean_terminated_length": 3437.125, "completions/min_length": 1175.0, "completions/min_terminated_length": 1175.0, "entropy": 0.3967345133423805, "epoch": 0.09916826615483046, "frac_reward_zero_std": 0.0, "grad_norm": 0.11670527127711057, "kl": 0.0011715787404682487, "learning_rate": 9.76237458498847e-07, "loss": 0.0901, "num_tokens": 13091368.0, "reward": 0.625, "reward_std": 0.5787960886955261, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.6273607015609741, "sampling/importance_sampling_ratio/mean": 1.000153660774231, "sampling/importance_sampling_ratio/min": 0.2957562804222107, "sampling/sampling_logp_difference/max": 1.218219518661499, "sampling/sampling_logp_difference/mean": 0.010982980951666832, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8124.0, "completions/mean_length": 3094.08349609375, "completions/mean_terminated_length": 2872.434814453125, "completions/min_length": 1172.0, "completions/min_terminated_length": 1172.0, "entropy": 0.2941536456346512, "epoch": 0.09980806142034548, "frac_reward_zero_std": 0.0, "grad_norm": 0.09471865363721908, "kl": 0.0019467987294774503, "learning_rate": 9.759303602328992e-07, "loss": 0.1123, "num_tokens": 13180034.0, "reward": 0.7916666865348816, "reward_std": 0.6592972278594971, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999567866325378, "sampling/importance_sampling_ratio/min": 0.15675194561481476, "sampling/sampling_logp_difference/max": 1.853090763092041, "sampling/sampling_logp_difference/mean": 0.00870753824710846, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7772.0, "completions/max_terminated_length": 7772.0, "completions/mean_length": 3206.45849609375, "completions/mean_terminated_length": 3206.45849609375, "completions/min_length": 1459.0, "completions/min_terminated_length": 1459.0, "entropy": 0.282423447817564, "epoch": 0.10044785668586052, "frac_reward_zero_std": 0.0, "grad_norm": 0.10137581057872203, "kl": 0.0012937573192175478, "learning_rate": 9.756213392070653e-07, "loss": 0.1747, "num_tokens": 13274557.0, "reward": 1.4583333730697632, "reward_std": 0.697779655456543, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.9306411743164062, "sampling/importance_sampling_ratio/mean": 0.9999890923500061, "sampling/importance_sampling_ratio/min": 0.5286362171173096, "sampling/sampling_logp_difference/max": 0.6578521728515625, "sampling/sampling_logp_difference/mean": 0.00885259360074997, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8166.0, "completions/max_terminated_length": 8166.0, "completions/mean_length": 3630.625, "completions/mean_terminated_length": 3630.625, "completions/min_length": 1182.0, "completions/min_terminated_length": 1182.0, "entropy": 0.3725673332810402, "epoch": 0.10108765195137556, "frac_reward_zero_std": 0.0, "grad_norm": 0.09818930178637292, "kl": 0.001151596225099638, "learning_rate": 9.75310396669791e-07, "loss": 0.0104, "num_tokens": 13375236.0, "reward": 1.3333333730697632, "reward_std": 0.5201624631881714, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.5950242280960083, "sampling/importance_sampling_ratio/mean": 1.0000317096710205, "sampling/importance_sampling_ratio/min": 0.4903479218482971, "sampling/sampling_logp_difference/max": 0.7126400470733643, "sampling/sampling_logp_difference/mean": 0.010001990012824535, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7966.0, "completions/mean_length": 5492.2919921875, "completions/mean_terminated_length": 5374.9130859375, "completions/min_length": 2339.0, "completions/min_terminated_length": 2339.0, "entropy": 0.5660589188337326, "epoch": 0.1017274472168906, "frac_reward_zero_std": 0.0, "grad_norm": 0.09031132166738998, "kl": 0.0013585734413936734, "learning_rate": 9.749975338772846e-07, "loss": 0.0099, "num_tokens": 13519587.0, "reward": 0.625, "reward_std": 0.8296798467636108, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001652240753174, "sampling/importance_sampling_ratio/min": 0.41673699021339417, "sampling/sampling_logp_difference/max": 0.8889522552490234, "sampling/sampling_logp_difference/mean": 0.014480629935860634, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5901.0, "completions/max_terminated_length": 5901.0, "completions/mean_length": 3226.95849609375, "completions/mean_terminated_length": 3226.95849609375, "completions/min_length": 1197.0, "completions/min_terminated_length": 1197.0, "entropy": 0.46725908666849136, "epoch": 0.10236724248240563, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06734792533536502, "kl": 0.0014440687955357134, "learning_rate": 9.746827520935127e-07, "loss": 0.0939, "num_tokens": 13606282.0, "reward": 0.9583333730697632, "reward_std": 0.4261821210384369, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.7001142501831055, "sampling/importance_sampling_ratio/mean": 0.9999176859855652, "sampling/importance_sampling_ratio/min": 0.5460700988769531, "sampling/sampling_logp_difference/max": 0.6050078868865967, "sampling/sampling_logp_difference/mean": 0.012678943574428558, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2656.0, "completions/max_terminated_length": 2656.0, "completions/mean_length": 1767.25, "completions/mean_terminated_length": 1767.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.25772928446531296, "epoch": 0.10300703774792067, "frac_reward_zero_std": 0.0, "grad_norm": 0.09041962952663789, "kl": 0.00104872090741992, "learning_rate": 9.74366052590195e-07, "loss": -0.0698, "num_tokens": 13672168.0, "reward": 1.375, "reward_std": 0.6439208984375, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.5115036964416504, "sampling/importance_sampling_ratio/mean": 1.0001307725906372, "sampling/importance_sampling_ratio/min": 0.5872364640235901, "sampling/sampling_logp_difference/max": 0.5323276519775391, "sampling/sampling_logp_difference/mean": 0.006615546997636557, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6605.0, "completions/mean_length": 2825.70849609375, "completions/mean_terminated_length": 2592.391357421875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5080314874649048, "epoch": 0.1036468330134357, "frac_reward_zero_std": 0.0, "grad_norm": 0.11114057936612497, "kl": 0.020139981963438913, "learning_rate": 9.74047436646798e-07, "loss": -0.0355, "num_tokens": 13755153.0, "reward": 0.625, "reward_std": 0.45032867789268494, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.407486915588379, "sampling/importance_sampling_ratio/mean": 1.0000402927398682, "sampling/importance_sampling_ratio/min": 0.42663654685020447, "sampling/sampling_logp_difference/max": 0.8518228530883789, "sampling/sampling_logp_difference/mean": 0.011744337156414986, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7009.0, "completions/max_terminated_length": 7009.0, "completions/mean_length": 2437.70849609375, "completions/mean_terminated_length": 2437.70849609375, "completions/min_length": 682.0, "completions/min_terminated_length": 682.0, "entropy": 0.3707592561841011, "epoch": 0.10428662827895073, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11220264887244313, "kl": 0.0013941571523901075, "learning_rate": 9.737269055505305e-07, "loss": -0.0146, "num_tokens": 13824530.0, "reward": 0.9166666865348816, "reward_std": 0.49601587653160095, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000240802764893, "sampling/importance_sampling_ratio/min": 0.7026947140693665, "sampling/sampling_logp_difference/max": 0.9319295883178711, "sampling/sampling_logp_difference/mean": 0.010315672494471073, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4219.0, "completions/max_terminated_length": 4219.0, "completions/mean_length": 2158.791748046875, "completions/mean_terminated_length": 2158.791748046875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "entropy": 0.27189667895436287, "epoch": 0.10492642354446577, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08814561916955772, "kl": 0.0011720672191586345, "learning_rate": 9.7340446059634e-07, "loss": -0.0339, "num_tokens": 13891221.0, "reward": 1.5, "reward_std": 0.6083246469497681, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.701257586479187, "sampling/importance_sampling_ratio/mean": 0.9998863339424133, "sampling/importance_sampling_ratio/min": 0.45689672231674194, "sampling/sampling_logp_difference/max": 0.7832979559898376, "sampling/sampling_logp_difference/mean": 0.007892790250480175, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3417.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 1641.125, "completions/mean_terminated_length": 1641.125, "completions/min_length": 648.0, "completions/min_terminated_length": 648.0, "entropy": 0.23111482337117195, "epoch": 0.10556621880998081, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.12256121919125189, "kl": 0.0029551163024734706, "learning_rate": 9.730801030869039e-07, "loss": 0.0913, "num_tokens": 13944384.0, "reward": 1.5416667461395264, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.490012526512146, "sampling/importance_sampling_ratio/mean": 1.000020146369934, "sampling/importance_sampling_ratio/min": 0.475975900888443, "sampling/sampling_logp_difference/max": 0.7423880100250244, "sampling/sampling_logp_difference/mean": 0.007265336811542511, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4563.0, "completions/max_terminated_length": 4563.0, "completions/mean_length": 1905.666748046875, "completions/mean_terminated_length": 1905.666748046875, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 0.2908911257982254, "epoch": 0.10620601407549585, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11193222381413012, "kl": 0.01651803561253473, "learning_rate": 9.727538343326278e-07, "loss": -0.1098, "num_tokens": 14010200.0, "reward": 0.9583333730697632, "reward_std": 0.4999842643737793, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.410478949546814, "sampling/importance_sampling_ratio/mean": 0.9997660517692566, "sampling/importance_sampling_ratio/min": 0.5124700665473938, "sampling/sampling_logp_difference/max": 0.6685130596160889, "sampling/sampling_logp_difference/mean": 0.008069831877946854, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4789.0, "completions/max_terminated_length": 4789.0, "completions/mean_length": 2544.666748046875, "completions/mean_terminated_length": 2544.666748046875, "completions/min_length": 956.0, "completions/min_terminated_length": 956.0, "entropy": 0.347372829914093, "epoch": 0.10684580934101087, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10163369547845116, "kl": 0.0016547393461223692, "learning_rate": 9.724256556516381e-07, "loss": 0.0395, "num_tokens": 14084264.0, "reward": 1.125, "reward_std": 0.5410773754119873, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.6613552570343018, "sampling/importance_sampling_ratio/mean": 1.000008225440979, "sampling/importance_sampling_ratio/min": 0.6903844475746155, "sampling/sampling_logp_difference/max": 0.5076336860656738, "sampling/sampling_logp_difference/mean": 0.009965892881155014, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7768.0, "completions/mean_length": 5063.625, "completions/mean_terminated_length": 4616.71435546875, "completions/min_length": 1527.0, "completions/min_terminated_length": 1527.0, "entropy": 0.50084538012743, "epoch": 0.10748560460652591, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.056310739231724694, "kl": 0.0012337381485849619, "learning_rate": 9.72095568369778e-07, "loss": 0.0573, "num_tokens": 14225727.0, "reward": 0.2083333432674408, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5806130170822144, "sampling/importance_sampling_ratio/mean": 0.9999106526374817, "sampling/importance_sampling_ratio/min": 0.43061432242393494, "sampling/sampling_logp_difference/max": 0.8425424098968506, "sampling/sampling_logp_difference/mean": 0.013030621223151684, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3924.0, "completions/max_terminated_length": 3924.0, "completions/mean_length": 2676.45849609375, "completions/mean_terminated_length": 2676.45849609375, "completions/min_length": 1590.0, "completions/min_terminated_length": 1590.0, "entropy": 0.31655149161815643, "epoch": 0.10812539987204095, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08444854165982048, "kl": 0.002683785278350115, "learning_rate": 9.717635738206007e-07, "loss": -0.002, "num_tokens": 14309290.0, "reward": 1.0, "reward_std": 0.5807350873947144, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.4491668939590454, "sampling/importance_sampling_ratio/mean": 1.0000216960906982, "sampling/importance_sampling_ratio/min": 0.5718480348587036, "sampling/sampling_logp_difference/max": 0.5588819980621338, "sampling/sampling_logp_difference/mean": 0.008750181645154953, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7123.0, "completions/mean_length": 4444.33349609375, "completions/mean_terminated_length": 4103.63671875, "completions/min_length": 1968.0, "completions/min_terminated_length": 1968.0, "entropy": 0.3330667167901993, "epoch": 0.10876519513755598, "frac_reward_zero_std": 0.0, "grad_norm": 0.0829020963230217, "kl": 0.0010635624348651618, "learning_rate": 9.714296733453649e-07, "loss": 0.1322, "num_tokens": 14432026.0, "reward": 0.75, "reward_std": 0.834788978099823, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.900130271911621, "sampling/importance_sampling_ratio/mean": 0.9999024271965027, "sampling/importance_sampling_ratio/min": 0.48353081941604614, "sampling/sampling_logp_difference/max": 0.7266402244567871, "sampling/sampling_logp_difference/mean": 0.009753350168466568, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2471.0, "completions/max_terminated_length": 2471.0, "completions/mean_length": 1451.916748046875, "completions/mean_terminated_length": 1451.916748046875, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.26107481867074966, "epoch": 0.10940499040307101, "frac_reward_zero_std": 0.0, "grad_norm": 0.13871980558932967, "kl": 0.001718692306894809, "learning_rate": 9.710938682930297e-07, "loss": 0.1202, "num_tokens": 14480352.0, "reward": 0.8333333730697632, "reward_std": 0.3900056481361389, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000109314918518, "sampling/importance_sampling_ratio/min": 0.785708487033844, "sampling/sampling_logp_difference/max": 1.0259966850280762, "sampling/sampling_logp_difference/mean": 0.007555713411420584, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6799.0, "completions/max_terminated_length": 6799.0, "completions/mean_length": 2529.5, "completions/mean_terminated_length": 2529.5, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "entropy": 0.36792518198490143, "epoch": 0.11004478566858605, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0987883599854079, "kl": 0.0016882918716873974, "learning_rate": 9.70756160020248e-07, "loss": -0.0456, "num_tokens": 14561100.0, "reward": 1.3333333730697632, "reward_std": 0.44819486141204834, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.4231855869293213, "sampling/importance_sampling_ratio/mean": 1.000085473060608, "sampling/importance_sampling_ratio/min": 0.39518943428993225, "sampling/sampling_logp_difference/max": 0.9283900260925293, "sampling/sampling_logp_difference/mean": 0.010401963256299496, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3722.0, "completions/max_terminated_length": 3722.0, "completions/mean_length": 2012.916748046875, "completions/mean_terminated_length": 2012.916748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.22993025928735733, "epoch": 0.11068458093410109, "frac_reward_zero_std": 0.0, "grad_norm": 0.08758220756119216, "kl": 0.1539736344711855, "learning_rate": 9.704165498913623e-07, "loss": -0.029, "num_tokens": 14631914.0, "reward": 1.375, "reward_std": 0.8496841192245483, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.6589566469192505, "sampling/importance_sampling_ratio/mean": 0.9999445080757141, "sampling/importance_sampling_ratio/min": 0.543402910232544, "sampling/sampling_logp_difference/max": 0.6099042892456055, "sampling/sampling_logp_difference/mean": 0.006673832889646292, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3588.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 2233.791748046875, "completions/mean_terminated_length": 2233.791748046875, "completions/min_length": 1063.0, "completions/min_terminated_length": 1063.0, "entropy": 0.35510464012622833, "epoch": 0.11132437619961612, "frac_reward_zero_std": 0.0, "grad_norm": 0.11211434329284474, "kl": 0.0017081423138733953, "learning_rate": 9.700750392783986e-07, "loss": -0.0159, "num_tokens": 14699693.0, "reward": 0.9166666865348816, "reward_std": 0.9006572961807251, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.619582176208496, "sampling/importance_sampling_ratio/mean": 0.999833881855011, "sampling/importance_sampling_ratio/min": 0.7119559645652771, "sampling/sampling_logp_difference/max": 0.48216819763183594, "sampling/sampling_logp_difference/mean": 0.010454259812831879, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4643.0, "completions/max_terminated_length": 4643.0, "completions/mean_length": 2075.75, "completions/mean_terminated_length": 2075.75, "completions/min_length": 879.0, "completions/min_terminated_length": 879.0, "entropy": 0.33153658360242844, "epoch": 0.11196417146513116, "frac_reward_zero_std": 0.0, "grad_norm": 0.10721583115351375, "kl": 0.0012165808293502778, "learning_rate": 9.697316295610603e-07, "loss": 0.0067, "num_tokens": 14758223.0, "reward": 1.5416667461395264, "reward_std": 0.622288167476654, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.6571840047836304, "sampling/importance_sampling_ratio/mean": 1.0000665187835693, "sampling/importance_sampling_ratio/min": 0.6976196765899658, "sampling/sampling_logp_difference/max": 0.505119800567627, "sampling/sampling_logp_difference/mean": 0.008829954080283642, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3027.0, "completions/max_terminated_length": 3027.0, "completions/mean_length": 1776.666748046875, "completions/mean_terminated_length": 1776.666748046875, "completions/min_length": 849.0, "completions/min_terminated_length": 849.0, "entropy": 0.30310819298028946, "epoch": 0.1126039667306462, "frac_reward_zero_std": 0.0, "grad_norm": 0.12437313971727897, "kl": 0.0015097715076990426, "learning_rate": 9.693863221267237e-07, "loss": 0.0568, "num_tokens": 14819559.0, "reward": 1.7083333730697632, "reward_std": 0.6621601581573486, "rewards/accuracy_reward/mean": 0.8333333134651184, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.5044423341751099, "sampling/importance_sampling_ratio/mean": 1.000116229057312, "sampling/importance_sampling_ratio/min": 0.7001886367797852, "sampling/sampling_logp_difference/max": 0.40842223167419434, "sampling/sampling_logp_difference/mean": 0.008681269362568855, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6826.0, "completions/max_terminated_length": 6826.0, "completions/mean_length": 1832.7083740234375, "completions/mean_terminated_length": 1832.7083740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41932713985443115, "epoch": 0.11324376199616124, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09971765838624438, "kl": 0.06442484247963876, "learning_rate": 9.690391183704316e-07, "loss": -0.1296, "num_tokens": 14878088.0, "reward": 1.0, "reward_std": 0.2357022613286972, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.865991234779358, "sampling/importance_sampling_ratio/mean": 0.9998331665992737, "sampling/importance_sampling_ratio/min": 0.322128564119339, "sampling/sampling_logp_difference/max": 1.1328045129776, "sampling/sampling_logp_difference/mean": 0.007935372181236744, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3867.0, "completions/max_terminated_length": 3867.0, "completions/mean_length": 1661.3333740234375, "completions/mean_terminated_length": 1661.3333740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.28696662187576294, "epoch": 0.11388355726167626, "frac_reward_zero_std": 0.0, "grad_norm": 0.0835562092805831, "kl": 0.046472405025269836, "learning_rate": 9.686900196948883e-07, "loss": -0.1125, "num_tokens": 14936072.0, "reward": 1.0833333730697632, "reward_std": 0.5201624631881714, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000172734260559, "sampling/importance_sampling_ratio/min": 0.455529123544693, "sampling/sampling_logp_difference/max": 1.0887043476104736, "sampling/sampling_logp_difference/mean": 0.005160085391253233, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3006.0, "completions/max_terminated_length": 3006.0, "completions/mean_length": 1787.166748046875, "completions/mean_terminated_length": 1787.166748046875, "completions/min_length": 1379.0, "completions/min_terminated_length": 1379.0, "entropy": 0.2983502447605133, "epoch": 0.1145233525271913, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.14183523569305131, "kl": 0.0019425797800067812, "learning_rate": 9.683390275104533e-07, "loss": 0.0297, "num_tokens": 14992892.0, "reward": 0.9583333730697632, "reward_std": 0.2721545100212097, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4645631313323975, "sampling/importance_sampling_ratio/mean": 0.9998889565467834, "sampling/importance_sampling_ratio/min": 0.5440816879272461, "sampling/sampling_logp_difference/max": 0.6086559295654297, "sampling/sampling_logp_difference/mean": 0.008613012731075287, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3046.0, "completions/max_terminated_length": 3046.0, "completions/mean_length": 2072.75, "completions/mean_terminated_length": 2072.75, "completions/min_length": 1517.0, "completions/min_terminated_length": 1517.0, "entropy": 0.2346906177699566, "epoch": 0.11516314779270634, "frac_reward_zero_std": 0.0, "grad_norm": 0.09353347120755967, "kl": 0.0015660607896279544, "learning_rate": 9.679861432351358e-07, "loss": -0.0024, "num_tokens": 15056846.0, "reward": 1.1666667461395264, "reward_std": 0.9986722469329834, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.6559308767318726, "sampling/importance_sampling_ratio/mean": 1.0000766515731812, "sampling/importance_sampling_ratio/min": 0.7078150510787964, "sampling/sampling_logp_difference/max": 0.5043632984161377, "sampling/sampling_logp_difference/mean": 0.007147596217691898, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5441.0, "completions/max_terminated_length": 5441.0, "completions/mean_length": 2448.041748046875, "completions/mean_terminated_length": 2448.041748046875, "completions/min_length": 1049.0, "completions/min_terminated_length": 1049.0, "entropy": 0.3211091235280037, "epoch": 0.11580294305822136, "frac_reward_zero_std": 0.0, "grad_norm": 0.09843201780571185, "kl": 0.001863524754298851, "learning_rate": 9.676313682945893e-07, "loss": -0.0928, "num_tokens": 15126943.0, "reward": 0.75, "reward_std": 0.5775296688079834, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.8285752534866333, "sampling/importance_sampling_ratio/mean": 0.999862015247345, "sampling/importance_sampling_ratio/min": 0.610332727432251, "sampling/sampling_logp_difference/max": 0.6035370826721191, "sampling/sampling_logp_difference/mean": 0.00947299599647522, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8172.0, "completions/max_terminated_length": 8172.0, "completions/mean_length": 4604.1669921875, "completions/mean_terminated_length": 4604.1669921875, "completions/min_length": 2080.0, "completions/min_terminated_length": 2080.0, "entropy": 0.48058895021677017, "epoch": 0.1164427383237364, "frac_reward_zero_std": 0.0, "grad_norm": 0.08794207913633333, "kl": 0.0013300616119522601, "learning_rate": 9.672747041221055e-07, "loss": -0.109, "num_tokens": 15249187.0, "reward": 0.8333333730697632, "reward_std": 0.8114668726921082, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.9485688209533691, "sampling/importance_sampling_ratio/mean": 0.9999573826789856, "sampling/importance_sampling_ratio/min": 0.4132603406906128, "sampling/sampling_logp_difference/max": 0.8836774826049805, "sampling/sampling_logp_difference/mean": 0.012393591925501823, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7671.0, "completions/mean_length": 3482.95849609375, "completions/mean_terminated_length": 3278.217529296875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.40669015794992447, "epoch": 0.11708253358925144, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11092584845566014, "kl": 0.13304297454305924, "learning_rate": 9.669161521586083e-07, "loss": 0.0095, "num_tokens": 15351450.0, "reward": 1.6666667461395264, "reward_std": 0.5807350873947144, "rewards/accuracy_reward/mean": 0.8333333134651184, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.603561520576477, "sampling/importance_sampling_ratio/mean": 1.000034213066101, "sampling/importance_sampling_ratio/min": 0.5859227180480957, "sampling/sampling_logp_difference/max": 0.5345673561096191, "sampling/sampling_logp_difference/mean": 0.01126231998205185, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5067.0, "completions/max_terminated_length": 5067.0, "completions/mean_length": 2735.166748046875, "completions/mean_terminated_length": 2735.166748046875, "completions/min_length": 1395.0, "completions/min_terminated_length": 1395.0, "entropy": 0.32151492685079575, "epoch": 0.11772232885476648, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08796321555769143, "kl": 0.0012528025545179844, "learning_rate": 9.665557138526492e-07, "loss": -0.0246, "num_tokens": 15431022.0, "reward": 0.6666666865348816, "reward_std": 0.34503278136253357, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5968343019485474, "sampling/importance_sampling_ratio/mean": 0.9999906420707703, "sampling/importance_sampling_ratio/min": 0.4032776951789856, "sampling/sampling_logp_difference/max": 0.9081299304962158, "sampling/sampling_logp_difference/mean": 0.008852770552039146, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6622.0, "completions/max_terminated_length": 6622.0, "completions/mean_length": 3200.5, "completions/mean_terminated_length": 3200.5, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3705345243215561, "epoch": 0.1183621241202815, "frac_reward_zero_std": 0.0, "grad_norm": 0.08539570304042085, "kl": 0.04386327386600897, "learning_rate": 9.66193390660399e-07, "loss": -0.1227, "num_tokens": 15529082.0, "reward": 0.875, "reward_std": 0.7921559810638428, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000709295272827, "sampling/importance_sampling_ratio/min": 0.2528436779975891, "sampling/sampling_logp_difference/max": 1.935410737991333, "sampling/sampling_logp_difference/mean": 0.00946545135229826, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6901.0, "completions/max_terminated_length": 6901.0, "completions/mean_length": 3256.58349609375, "completions/mean_terminated_length": 3256.58349609375, "completions/min_length": 1144.0, "completions/min_terminated_length": 1144.0, "entropy": 0.25985143333673477, "epoch": 0.11900191938579655, "frac_reward_zero_std": 0.0, "grad_norm": 0.08056685559264454, "kl": 0.001414552389178425, "learning_rate": 9.658291840456452e-07, "loss": 0.0963, "num_tokens": 15632480.0, "reward": 1.0833333730697632, "reward_std": 0.7148863673210144, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000070333480835, "sampling/importance_sampling_ratio/min": 0.5187987685203552, "sampling/sampling_logp_difference/max": 0.987576961517334, "sampling/sampling_logp_difference/mean": 0.007627311162650585, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5971.0, "completions/max_terminated_length": 5971.0, "completions/mean_length": 3852.541748046875, "completions/mean_terminated_length": 3852.541748046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "entropy": 0.37653036415576935, "epoch": 0.11964171465131158, "frac_reward_zero_std": 0.0, "grad_norm": 0.08944139110535115, "kl": 0.0019841365865431726, "learning_rate": 9.654630954797827e-07, "loss": -0.0377, "num_tokens": 15738701.0, "reward": 0.5, "reward_std": 0.5858359336853027, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999433159828186, "sampling/importance_sampling_ratio/min": 0.45393115282058716, "sampling/sampling_logp_difference/max": 0.7898097038269043, "sampling/sampling_logp_difference/mean": 0.010744599625468254, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5945.0, "completions/max_terminated_length": 5945.0, "completions/mean_length": 2564.666748046875, "completions/mean_terminated_length": 2564.666748046875, "completions/min_length": 403.0, "completions/min_terminated_length": 403.0, "entropy": 0.33649132400751114, "epoch": 0.12028150991682661, "frac_reward_zero_std": 0.0, "grad_norm": 0.0967316676370209, "kl": 0.0031690552132204175, "learning_rate": 9.650951264418109e-07, "loss": -0.146, "num_tokens": 15813661.0, "reward": 0.6666666865348816, "reward_std": 0.6288648843765259, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.573612093925476, "sampling/importance_sampling_ratio/mean": 0.9999151229858398, "sampling/importance_sampling_ratio/min": 0.4161720275878906, "sampling/sampling_logp_difference/max": 0.8766565322875977, "sampling/sampling_logp_difference/mean": 0.009076835587620735, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7053.0, "completions/max_terminated_length": 7053.0, "completions/mean_length": 2808.666748046875, "completions/mean_terminated_length": 2808.666748046875, "completions/min_length": 1110.0, "completions/min_terminated_length": 1110.0, "entropy": 0.28207144141197205, "epoch": 0.12092130518234165, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08777123855768906, "kl": 0.001310580555582419, "learning_rate": 9.647252784183253e-07, "loss": 0.0525, "num_tokens": 15899101.0, "reward": 1.0833333730697632, "reward_std": 0.40627965331077576, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.557088017463684, "sampling/importance_sampling_ratio/mean": 0.9998623728752136, "sampling/importance_sampling_ratio/min": 0.3660735785961151, "sampling/sampling_logp_difference/max": 1.0049209594726562, "sampling/sampling_logp_difference/mean": 0.008841414004564285, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4666.0, "completions/max_terminated_length": 4666.0, "completions/mean_length": 1578.0, "completions/mean_terminated_length": 1578.0, "completions/min_length": 500.0, "completions/min_terminated_length": 500.0, "entropy": 0.26318780705332756, "epoch": 0.12156110044785669, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.14922752769079864, "kl": 0.0031699933169875294, "learning_rate": 9.64353552903513e-07, "loss": 0.1287, "num_tokens": 15950565.0, "reward": 1.5416667461395264, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.439249038696289, "sampling/importance_sampling_ratio/mean": 1.0000048875808716, "sampling/importance_sampling_ratio/min": 0.3751612901687622, "sampling/sampling_logp_difference/max": 0.9803992509841919, "sampling/sampling_logp_difference/mean": 0.0078009869903326035, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6581.0, "completions/max_terminated_length": 6581.0, "completions/mean_length": 2893.08349609375, "completions/mean_terminated_length": 2893.08349609375, "completions/min_length": 1053.0, "completions/min_terminated_length": 1053.0, "entropy": 0.3800032436847687, "epoch": 0.12220089571337173, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09608123769146155, "kl": 0.001668686221819371, "learning_rate": 9.639799513991459e-07, "loss": -0.0359, "num_tokens": 16036247.0, "reward": 1.375, "reward_std": 0.6866921186447144, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.7938120365142822, "sampling/importance_sampling_ratio/mean": 1.0000959634780884, "sampling/importance_sampling_ratio/min": 0.5394374132156372, "sampling/sampling_logp_difference/max": 0.6172284483909607, "sampling/sampling_logp_difference/mean": 0.010535255074501038, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 8036.0, "completions/mean_length": 3266.08349609375, "completions/mean_terminated_length": 2562.381103515625, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.47204573452472687, "epoch": 0.12284069097888675, "frac_reward_zero_std": 0.0, "grad_norm": 0.09543266949167681, "kl": 0.019206619210308418, "learning_rate": 9.636044754145752e-07, "loss": 0.0132, "num_tokens": 16136937.0, "reward": 0.5, "reward_std": 0.7568016052246094, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001283884048462, "sampling/importance_sampling_ratio/min": 0.42283567786216736, "sampling/sampling_logp_difference/max": 1.2943553924560547, "sampling/sampling_logp_difference/mean": 0.009988876059651375, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6234.0, "completions/max_terminated_length": 6234.0, "completions/mean_length": 3173.416748046875, "completions/mean_terminated_length": 3173.416748046875, "completions/min_length": 1515.0, "completions/min_terminated_length": 1515.0, "entropy": 0.379552498459816, "epoch": 0.12348048624440179, "frac_reward_zero_std": 0.0, "grad_norm": 0.10201212354025196, "kl": 0.0018219959747511894, "learning_rate": 9.632271264667248e-07, "loss": 0.0034, "num_tokens": 16225651.0, "reward": 0.875, "reward_std": 0.8565260171890259, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.7817060947418213, "sampling/importance_sampling_ratio/mean": 1.0000554323196411, "sampling/importance_sampling_ratio/min": 0.4461058974266052, "sampling/sampling_logp_difference/max": 0.8071990013122559, "sampling/sampling_logp_difference/mean": 0.011079381220042706, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5616.0, "completions/max_terminated_length": 5616.0, "completions/mean_length": 2504.875, "completions/mean_terminated_length": 2504.875, "completions/min_length": 1246.0, "completions/min_terminated_length": 1246.0, "entropy": 0.30160418152809143, "epoch": 0.12412028150991683, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08945398919817663, "kl": 0.001502518804045394, "learning_rate": 9.628479060800853e-07, "loss": -0.0922, "num_tokens": 16304488.0, "reward": 1.2916667461395264, "reward_std": 0.4244926869869232, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.3885471820831299, "sampling/importance_sampling_ratio/mean": 0.9999129176139832, "sampling/importance_sampling_ratio/min": 0.563793957233429, "sampling/sampling_logp_difference/max": 0.5730664730072021, "sampling/sampling_logp_difference/mean": 0.0088003259152174, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7424.0, "completions/max_terminated_length": 7424.0, "completions/mean_length": 3130.875, "completions/mean_terminated_length": 3130.875, "completions/min_length": 980.0, "completions/min_terminated_length": 980.0, "entropy": 0.2341158650815487, "epoch": 0.12476007677543186, "frac_reward_zero_std": 0.0, "grad_norm": 0.0893305957996927, "kl": 0.001596681249793619, "learning_rate": 9.624668157867082e-07, "loss": 0.0094, "num_tokens": 16401629.0, "reward": 0.8333333730697632, "reward_std": 0.5440332293510437, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000070571899414, "sampling/importance_sampling_ratio/min": 0.2712688446044922, "sampling/sampling_logp_difference/max": 1.3046449422836304, "sampling/sampling_logp_difference/mean": 0.007321540266275406, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7004.0, "completions/max_terminated_length": 7004.0, "completions/mean_length": 2585.041748046875, "completions/mean_terminated_length": 2585.041748046875, "completions/min_length": 519.0, "completions/min_terminated_length": 519.0, "entropy": 0.31214287132024765, "epoch": 0.1253998720409469, "frac_reward_zero_std": 1.0, "grad_norm": 0.005011736947136607, "kl": 0.001955682091647759, "learning_rate": 9.620838571261993e-07, "loss": 0.0002, "num_tokens": 16478942.0, "reward": 0.3333333432674408, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.7734647989273071, "sampling/importance_sampling_ratio/mean": 1.0000208616256714, "sampling/importance_sampling_ratio/min": 0.48748496174812317, "sampling/sampling_logp_difference/max": 0.7184958457946777, "sampling/sampling_logp_difference/mean": 0.00900989305227995, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5851.0, "completions/max_terminated_length": 5851.0, "completions/mean_length": 2821.33349609375, "completions/mean_terminated_length": 2821.33349609375, "completions/min_length": 1570.0, "completions/min_terminated_length": 1570.0, "entropy": 0.2756134793162346, "epoch": 0.12603966730646193, "frac_reward_zero_std": 0.0, "grad_norm": 0.09139299813864474, "kl": 0.0012518968433141708, "learning_rate": 9.616990316457125e-07, "loss": -0.0411, "num_tokens": 16568942.0, "reward": 0.875, "reward_std": 0.824333131313324, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001286268234253, "sampling/importance_sampling_ratio/min": 0.5223491787910461, "sampling/sampling_logp_difference/max": 1.007288932800293, "sampling/sampling_logp_difference/mean": 0.008081885986030102, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6118.0, "completions/max_terminated_length": 6118.0, "completions/mean_length": 3169.166748046875, "completions/mean_terminated_length": 3169.166748046875, "completions/min_length": 920.0, "completions/min_terminated_length": 920.0, "entropy": 0.30752816796302795, "epoch": 0.12667946257197696, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07404338791626125, "kl": 0.0014699407038278878, "learning_rate": 9.613123408999438e-07, "loss": 0.0151, "num_tokens": 16658506.0, "reward": 1.3333333730697632, "reward_std": 0.49601587653160095, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.9326050281524658, "sampling/importance_sampling_ratio/mean": 1.0000333786010742, "sampling/importance_sampling_ratio/min": 0.14402595162391663, "sampling/sampling_logp_difference/max": 1.9377617835998535, "sampling/sampling_logp_difference/mean": 0.009327744133770466, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3618.0, "completions/max_terminated_length": 3618.0, "completions/mean_length": 1995.5, "completions/mean_terminated_length": 1995.5, "completions/min_length": 1200.0, "completions/min_terminated_length": 1200.0, "entropy": 0.2949874699115753, "epoch": 0.127319257837492, "frac_reward_zero_std": 0.0, "grad_norm": 0.16182186079353109, "kl": 0.0022535961179528385, "learning_rate": 9.609237864511247e-07, "loss": 0.0457, "num_tokens": 16721534.0, "reward": 0.6666666865348816, "reward_std": 0.8556844592094421, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.7466800212860107, "sampling/importance_sampling_ratio/mean": 1.000017762184143, "sampling/importance_sampling_ratio/min": 0.5777001976966858, "sampling/sampling_logp_difference/max": 0.5577167868614197, "sampling/sampling_logp_difference/mean": 0.008631477132439613, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6924.0, "completions/max_terminated_length": 6924.0, "completions/mean_length": 4054.5, "completions/mean_terminated_length": 4054.5, "completions/min_length": 1568.0, "completions/min_terminated_length": 1568.0, "entropy": 0.3461889773607254, "epoch": 0.12795905310300704, "frac_reward_zero_std": 0.0, "grad_norm": 0.09118613924592092, "kl": 0.0010691320057958364, "learning_rate": 9.605333698690164e-07, "loss": -0.017, "num_tokens": 16842570.0, "reward": 1.1666667461395264, "reward_std": 0.7565258741378784, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.4754372835159302, "sampling/importance_sampling_ratio/mean": 0.9999711513519287, "sampling/importance_sampling_ratio/min": 0.016417814418673515, "sampling/sampling_logp_difference/max": 4.10938835144043, "sampling/sampling_logp_difference/mean": 0.00967211090028286, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6561.0, "completions/max_terminated_length": 6561.0, "completions/mean_length": 3454.0, "completions/mean_terminated_length": 3454.0, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.459149107336998, "epoch": 0.12859884836852206, "frac_reward_zero_std": 0.0, "grad_norm": 0.0916518241036874, "kl": 0.09590687009040266, "learning_rate": 9.601410927309025e-07, "loss": -0.0268, "num_tokens": 16939514.0, "reward": 0.375, "reward_std": 0.5049939155578613, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4552072286605835, "sampling/importance_sampling_ratio/mean": 1.0001416206359863, "sampling/importance_sampling_ratio/min": 0.432792991399765, "sampling/sampling_logp_difference/max": 0.8374958038330078, "sampling/sampling_logp_difference/mean": 0.011776691302657127, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5493.0, "completions/mean_length": 2589.916748046875, "completions/mean_terminated_length": 2346.347900390625, "completions/min_length": 1061.0, "completions/min_terminated_length": 1061.0, "entropy": 0.3065914064645767, "epoch": 0.12923864363403711, "frac_reward_zero_std": 0.0, "grad_norm": 0.10753911788090896, "kl": 0.0015862122527323663, "learning_rate": 9.597469566215839e-07, "loss": 0.0863, "num_tokens": 17023960.0, "reward": 1.125, "reward_std": 0.7194125056266785, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.4390528202056885, "sampling/importance_sampling_ratio/mean": 1.0000691413879395, "sampling/importance_sampling_ratio/min": 0.3935733139514923, "sampling/sampling_logp_difference/max": 0.932487964630127, "sampling/sampling_logp_difference/mean": 0.008692947216331959, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7907.0, "completions/mean_length": 3521.125, "completions/mean_terminated_length": 3096.5, "completions/min_length": 883.0, "completions/min_terminated_length": 883.0, "entropy": 0.39361173659563065, "epoch": 0.12987843889955214, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05916808608663437, "kl": 0.0017365040257573128, "learning_rate": 9.593509631333717e-07, "loss": -0.0041, "num_tokens": 17122723.0, "reward": 0.9166666865348816, "reward_std": 0.39602547883987427, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5495667457580566, "sampling/importance_sampling_ratio/mean": 0.9999424815177917, "sampling/importance_sampling_ratio/min": 0.38554978370666504, "sampling/sampling_logp_difference/max": 0.9530849456787109, "sampling/sampling_logp_difference/mean": 0.011033166199922562, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5858.0, "completions/max_terminated_length": 5858.0, "completions/mean_length": 2453.95849609375, "completions/mean_terminated_length": 2453.95849609375, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "entropy": 0.32654376327991486, "epoch": 0.13051823416506717, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06580227331318791, "kl": 0.0016234063950832933, "learning_rate": 9.589531138660803e-07, "loss": 0.0569, "num_tokens": 17193714.0, "reward": 1.0, "reward_std": 0.48678088188171387, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.509507179260254, "sampling/importance_sampling_ratio/mean": 0.9999462962150574, "sampling/importance_sampling_ratio/min": 0.45525917410850525, "sampling/sampling_logp_difference/max": 0.7868884801864624, "sampling/sampling_logp_difference/mean": 0.008715780451893806, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3856.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 2178.33349609375, "completions/mean_terminated_length": 2178.33349609375, "completions/min_length": 1534.0, "completions/min_terminated_length": 1534.0, "entropy": 0.34024661779403687, "epoch": 0.13115802943058222, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0751557621448825, "kl": 0.001786675478797406, "learning_rate": 9.585534104270217e-07, "loss": -0.0371, "num_tokens": 17259850.0, "reward": 1.4166667461395264, "reward_std": 0.2903675436973572, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 1.689210057258606, "sampling/importance_sampling_ratio/mean": 1.0000804662704468, "sampling/importance_sampling_ratio/min": 0.655085563659668, "sampling/sampling_logp_difference/max": 0.5242609977722168, "sampling/sampling_logp_difference/mean": 0.00919214729219675, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4839.0, "completions/max_terminated_length": 4839.0, "completions/mean_length": 3148.33349609375, "completions/mean_terminated_length": 3148.33349609375, "completions/min_length": 1557.0, "completions/min_terminated_length": 1557.0, "entropy": 0.22845356911420822, "epoch": 0.13179782469609724, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06785333289063626, "kl": 0.001176911871880293, "learning_rate": 9.581518544309992e-07, "loss": 0.011, "num_tokens": 17362602.0, "reward": 1.1666667461395264, "reward_std": 0.30860671401023865, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.741204857826233, "sampling/importance_sampling_ratio/mean": 1.0001963376998901, "sampling/importance_sampling_ratio/min": 0.5664181709289551, "sampling/sampling_logp_difference/max": 0.5684225559234619, "sampling/sampling_logp_difference/mean": 0.006645712070167065, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4573.0, "completions/max_terminated_length": 4573.0, "completions/mean_length": 2153.125, "completions/mean_terminated_length": 2153.125, "completions/min_length": 1391.0, "completions/min_terminated_length": 1391.0, "entropy": 0.24926623329520226, "epoch": 0.1324376199616123, "frac_reward_zero_std": 0.0, "grad_norm": 0.10848952489142485, "kl": 0.002120224991813302, "learning_rate": 9.577484475002998e-07, "loss": -0.0892, "num_tokens": 17440925.0, "reward": 1.2083333730697632, "reward_std": 0.7168253660202026, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434515476227, "sampling/importance_sampling_ratio/max": 1.78864324092865, "sampling/importance_sampling_ratio/mean": 0.9998710751533508, "sampling/importance_sampling_ratio/min": 0.4033772051334381, "sampling/sampling_logp_difference/max": 0.9078831672668457, "sampling/sampling_logp_difference/mean": 0.007850997149944305, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4510.0, "completions/max_terminated_length": 4510.0, "completions/mean_length": 2495.70849609375, "completions/mean_terminated_length": 2495.70849609375, "completions/min_length": 581.0, "completions/min_terminated_length": 581.0, "entropy": 0.2647634744644165, "epoch": 0.13307741522712732, "frac_reward_zero_std": 0.0, "grad_norm": 0.08614016218945032, "kl": 0.0020515634096227586, "learning_rate": 9.573431912646887e-07, "loss": -0.028, "num_tokens": 17525390.0, "reward": 0.8333333730697632, "reward_std": 0.7463539838790894, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.684126377105713, "sampling/importance_sampling_ratio/mean": 0.9999478459358215, "sampling/importance_sampling_ratio/min": 0.3565734624862671, "sampling/sampling_logp_difference/max": 1.0312150716781616, "sampling/sampling_logp_difference/mean": 0.007982464507222176, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5986.0, "completions/max_terminated_length": 5986.0, "completions/mean_length": 3648.041748046875, "completions/mean_terminated_length": 3648.041748046875, "completions/min_length": 2050.0, "completions/min_terminated_length": 2050.0, "entropy": 0.483303464949131, "epoch": 0.13371721049264235, "frac_reward_zero_std": 0.0, "grad_norm": 0.0933233496862153, "kl": 0.0016679684340488166, "learning_rate": 9.569360873614019e-07, "loss": -0.0385, "num_tokens": 17623375.0, "reward": 0.8333333730697632, "reward_std": 0.7953876256942749, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.621727466583252, "sampling/importance_sampling_ratio/mean": 0.9999358057975769, "sampling/importance_sampling_ratio/min": 0.2241968959569931, "sampling/sampling_logp_difference/max": 1.4952306747436523, "sampling/sampling_logp_difference/mean": 0.012396524660289288, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6662.0, "completions/max_terminated_length": 6662.0, "completions/mean_length": 3676.416748046875, "completions/mean_terminated_length": 3676.416748046875, "completions/min_length": 1921.0, "completions/min_terminated_length": 1921.0, "entropy": 0.3886456787586212, "epoch": 0.1343570057581574, "frac_reward_zero_std": 0.0, "grad_norm": 0.12212198747381506, "kl": 0.002015983685851097, "learning_rate": 9.565271374351405e-07, "loss": 0.0339, "num_tokens": 17727889.0, "reward": 0.7916666865348816, "reward_std": 0.5685418844223022, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5720854997634888, "sampling/importance_sampling_ratio/mean": 1.0000377893447876, "sampling/importance_sampling_ratio/min": 0.5806993246078491, "sampling/sampling_logp_difference/max": 0.5435221195220947, "sampling/sampling_logp_difference/mean": 0.01085890457034111, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6883.0, "completions/mean_length": 2937.70849609375, "completions/mean_terminated_length": 2709.260986328125, "completions/min_length": 923.0, "completions/min_terminated_length": 923.0, "entropy": 0.33564162254333496, "epoch": 0.13499680102367242, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08796258805542305, "kl": 0.0018820841214619577, "learning_rate": 9.56116343138063e-07, "loss": 0.0297, "num_tokens": 17817050.0, "reward": 1.0416667461395264, "reward_std": 0.42645785212516785, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.99372136592865, "sampling/importance_sampling_ratio/mean": 0.9999620318412781, "sampling/importance_sampling_ratio/min": 0.5283352136611938, "sampling/sampling_logp_difference/max": 0.6900029182434082, "sampling/sampling_logp_difference/mean": 0.009243922308087349, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4718.0, "completions/max_terminated_length": 4718.0, "completions/mean_length": 2082.291748046875, "completions/mean_terminated_length": 2082.291748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.42102590948343277, "epoch": 0.13563659628918745, "frac_reward_zero_std": 0.0, "grad_norm": 0.12116170842250784, "kl": 0.03941970531013794, "learning_rate": 9.557037061297798e-07, "loss": -0.0561, "num_tokens": 17878409.0, "reward": 0.75, "reward_std": 0.7350384593009949, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.6566886901855469, "sampling/importance_sampling_ratio/mean": 0.9999774098396301, "sampling/importance_sampling_ratio/min": 0.47899144887924194, "sampling/sampling_logp_difference/max": 0.7360725402832031, "sampling/sampling_logp_difference/mean": 0.010601649060845375, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6851.0, "completions/max_terminated_length": 6851.0, "completions/mean_length": 4100.0, "completions/mean_terminated_length": 4100.0, "completions/min_length": 2046.0, "completions/min_terminated_length": 2046.0, "entropy": 0.41201725602149963, "epoch": 0.1362763915547025, "frac_reward_zero_std": 0.0, "grad_norm": 0.0940027865762411, "kl": 0.0016447036468889564, "learning_rate": 9.55289228077345e-07, "loss": -0.0166, "num_tokens": 17990369.0, "reward": 0.625, "reward_std": 0.6106518507003784, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000724792480469, "sampling/importance_sampling_ratio/min": 0.3860727548599243, "sampling/sampling_logp_difference/max": 1.089390754699707, "sampling/sampling_logp_difference/mean": 0.011259188875555992, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7366.0, "completions/mean_length": 4195.58349609375, "completions/mean_terminated_length": 3832.27294921875, "completions/min_length": 1940.0, "completions/min_terminated_length": 1940.0, "entropy": 0.3194342404603958, "epoch": 0.13691618682021753, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05796481534948504, "kl": 0.0018064509786199778, "learning_rate": 9.548729106552513e-07, "loss": 0.0244, "num_tokens": 18111967.0, "reward": 0.625, "reward_std": 0.48112308979034424, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000156164169312, "sampling/importance_sampling_ratio/min": 0.3465498685836792, "sampling/sampling_logp_difference/max": 1.0597285032272339, "sampling/sampling_logp_difference/mean": 0.009293889626860619, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5619.0, "completions/mean_length": 2824.5, "completions/mean_terminated_length": 2591.13037109375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "entropy": 0.488766111433506, "epoch": 0.13755598208573255, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09593601629284765, "kl": 0.0015399438270833343, "learning_rate": 9.544547555454221e-07, "loss": -0.008, "num_tokens": 18210659.0, "reward": 0.5833333730697632, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.833430290222168, "sampling/importance_sampling_ratio/mean": 1.0000523328781128, "sampling/importance_sampling_ratio/min": 0.7099611163139343, "sampling/sampling_logp_difference/max": 0.6061887741088867, "sampling/sampling_logp_difference/mean": 0.01020234264433384, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5070.0, "completions/max_terminated_length": 5070.0, "completions/mean_length": 2779.25, "completions/mean_terminated_length": 2779.25, "completions/min_length": 1004.0, "completions/min_terminated_length": 1004.0, "entropy": 0.46586302667856216, "epoch": 0.1381957773512476, "frac_reward_zero_std": 0.0, "grad_norm": 0.1019607938890137, "kl": 0.002826534560881555, "learning_rate": 9.540347644372052e-07, "loss": 0.0432, "num_tokens": 18295393.0, "reward": 0.875, "reward_std": 0.7604942321777344, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.8084535598754883, "sampling/importance_sampling_ratio/mean": 1.0001496076583862, "sampling/importance_sampling_ratio/min": 0.48620662093162537, "sampling/sampling_logp_difference/max": 0.7211215496063232, "sampling/sampling_logp_difference/mean": 0.012676316313445568, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6385.0, "completions/max_terminated_length": 6385.0, "completions/mean_length": 2877.791748046875, "completions/mean_terminated_length": 2877.791748046875, "completions/min_length": 1259.0, "completions/min_terminated_length": 1259.0, "entropy": 0.2807503342628479, "epoch": 0.13883557261676263, "frac_reward_zero_std": 0.0, "grad_norm": 0.09929915671892534, "kl": 0.001696067425655201, "learning_rate": 9.536129390273657e-07, "loss": -0.0121, "num_tokens": 18378148.0, "reward": 1.25, "reward_std": 0.6024982333183289, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.9499975442886353, "sampling/importance_sampling_ratio/mean": 0.9999499320983887, "sampling/importance_sampling_ratio/min": 0.37102851271629333, "sampling/sampling_logp_difference/max": 0.9914764165878296, "sampling/sampling_logp_difference/mean": 0.00857233814895153, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3350.0, "completions/max_terminated_length": 3350.0, "completions/mean_length": 2447.70849609375, "completions/mean_terminated_length": 2447.70849609375, "completions/min_length": 1454.0, "completions/min_terminated_length": 1454.0, "entropy": 0.2760830782353878, "epoch": 0.13947536788227768, "frac_reward_zero_std": 0.0, "grad_norm": 0.09997384041915895, "kl": 0.002751219319179654, "learning_rate": 9.531892810200792e-07, "loss": 0.0475, "num_tokens": 18461381.0, "reward": 0.9166666865348816, "reward_std": 0.9347000122070312, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5222926139831543, "sampling/importance_sampling_ratio/mean": 0.9999480247497559, "sampling/importance_sampling_ratio/min": 0.5758744478225708, "sampling/sampling_logp_difference/max": 0.5518655776977539, "sampling/sampling_logp_difference/mean": 0.00809266697615385, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7537.0, "completions/max_terminated_length": 7537.0, "completions/mean_length": 3037.75, "completions/mean_terminated_length": 3037.75, "completions/min_length": 1750.0, "completions/min_terminated_length": 1750.0, "entropy": 0.2772749587893486, "epoch": 0.1401151631477927, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.028974829571211305, "kl": 0.0016259503900073469, "learning_rate": 9.527637921269254e-07, "loss": 0.0091, "num_tokens": 18548655.0, "reward": 0.5833333730697632, "reward_std": 0.2357022613286972, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999370574951172, "sampling/importance_sampling_ratio/min": 0.37339189648628235, "sampling/sampling_logp_difference/max": 1.3973784446716309, "sampling/sampling_logp_difference/mean": 0.008006365038454533, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5241.0, "completions/mean_length": 2666.45849609375, "completions/mean_terminated_length": 2426.217529296875, "completions/min_length": 668.0, "completions/min_terminated_length": 668.0, "entropy": 0.3135569542646408, "epoch": 0.14075495841330773, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07525423123213525, "kl": 0.0019503744842950255, "learning_rate": 9.523364740668804e-07, "loss": 0.1323, "num_tokens": 18632258.0, "reward": 1.25, "reward_std": 0.34503278136253357, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.6803537607192993, "sampling/importance_sampling_ratio/mean": 0.9998529553413391, "sampling/importance_sampling_ratio/min": 0.48174381256103516, "sampling/sampling_logp_difference/max": 0.7303428649902344, "sampling/sampling_logp_difference/mean": 0.009044930338859558, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2295.0, "completions/max_terminated_length": 2295.0, "completions/mean_length": 1603.916748046875, "completions/mean_terminated_length": 1603.916748046875, "completions/min_length": 1041.0, "completions/min_terminated_length": 1041.0, "entropy": 0.22684862837195396, "epoch": 0.1413947536788228, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0652690377497539, "kl": 0.0021644030639436096, "learning_rate": 9.519073285663102e-07, "loss": 0.0206, "num_tokens": 18688976.0, "reward": 1.5, "reward_std": 0.30860671401023865, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000914335250854, "sampling/importance_sampling_ratio/min": 0.4785533547401428, "sampling/sampling_logp_difference/max": 0.9006643295288086, "sampling/sampling_logp_difference/mean": 0.006764642428606749, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6946.0, "completions/mean_length": 4260.58349609375, "completions/mean_terminated_length": 4089.65234375, "completions/min_length": 1553.0, "completions/min_terminated_length": 1553.0, "entropy": 0.4027915969491005, "epoch": 0.1420345489443378, "frac_reward_zero_std": 0.0, "grad_norm": 0.08772237525712197, "kl": 0.0016806154744699597, "learning_rate": 9.514763573589639e-07, "loss": -0.0924, "num_tokens": 18803902.0, "reward": 0.8333333730697632, "reward_std": 0.7547006607055664, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000953674316406, "sampling/importance_sampling_ratio/min": 0.19022266566753387, "sampling/sampling_logp_difference/max": 1.659559965133667, "sampling/sampling_logp_difference/mean": 0.011038312688469887, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5475.0, "completions/max_terminated_length": 5475.0, "completions/mean_length": 2072.08349609375, "completions/mean_terminated_length": 2072.08349609375, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.3642254024744034, "epoch": 0.14267434420985284, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09081636787815711, "kl": 0.0018196023011114448, "learning_rate": 9.510435621859661e-07, "loss": -0.0223, "num_tokens": 18862752.0, "reward": 1.125, "reward_std": 0.29602527618408203, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.656120777130127, "sampling/importance_sampling_ratio/mean": 1.0000686645507812, "sampling/importance_sampling_ratio/min": 0.6034650802612305, "sampling/sampling_logp_difference/max": 0.5050671100616455, "sampling/sampling_logp_difference/mean": 0.010435044765472412, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3467.0, "completions/max_terminated_length": 3467.0, "completions/mean_length": 2365.0, "completions/mean_terminated_length": 2365.0, "completions/min_length": 1538.0, "completions/min_terminated_length": 1538.0, "entropy": 0.19235092028975487, "epoch": 0.1433141394753679, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06876977692699711, "kl": 0.0015213887963909656, "learning_rate": 9.506089447958106e-07, "loss": -0.0604, "num_tokens": 18940744.0, "reward": 1.2916667461395264, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.694148302078247, "sampling/importance_sampling_ratio/mean": 1.0000265836715698, "sampling/importance_sampling_ratio/min": 0.3218189477920532, "sampling/sampling_logp_difference/max": 1.1337661743164062, "sampling/sampling_logp_difference/mean": 0.005928413942456245, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5655.0, "completions/max_terminated_length": 5655.0, "completions/mean_length": 2903.541748046875, "completions/mean_terminated_length": 2903.541748046875, "completions/min_length": 919.0, "completions/min_terminated_length": 919.0, "entropy": 0.2603057287633419, "epoch": 0.14395393474088292, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09189142733143785, "kl": 0.0015440776478499174, "learning_rate": 9.501725069443527e-07, "loss": 0.1297, "num_tokens": 19023901.0, "reward": 1.3333333730697632, "reward_std": 0.49601587653160095, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998071789741516, "sampling/importance_sampling_ratio/min": 0.18749530613422394, "sampling/sampling_logp_difference/max": 1.6740014553070068, "sampling/sampling_logp_difference/mean": 0.00793329905718565, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7722.0, "completions/mean_length": 3411.291748046875, "completions/mean_terminated_length": 3203.434814453125, "completions/min_length": 438.0, "completions/min_terminated_length": 438.0, "entropy": 0.458142414689064, "epoch": 0.14459373000639794, "frac_reward_zero_std": 0.0, "grad_norm": 0.09767042505167689, "kl": 0.0019330465293023735, "learning_rate": 9.497342503948025e-07, "loss": 0.0326, "num_tokens": 19117412.0, "reward": 0.8333333730697632, "reward_std": 0.7900222539901733, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001462697982788, "sampling/importance_sampling_ratio/min": 0.39156341552734375, "sampling/sampling_logp_difference/max": 1.3747613430023193, "sampling/sampling_logp_difference/mean": 0.012274973094463348, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6733.0, "completions/max_terminated_length": 6733.0, "completions/mean_length": 2507.70849609375, "completions/mean_terminated_length": 2507.70849609375, "completions/min_length": 626.0, "completions/min_terminated_length": 626.0, "entropy": 0.36839935183525085, "epoch": 0.145233525271913, "frac_reward_zero_std": 0.0, "grad_norm": 0.10818287465369311, "kl": 0.0018093417747877538, "learning_rate": 9.492941769177174e-07, "loss": 0.0059, "num_tokens": 19195485.0, "reward": 1.2916667461395264, "reward_std": 0.7767796516418457, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.5143554210662842, "sampling/importance_sampling_ratio/mean": 0.9999744296073914, "sampling/importance_sampling_ratio/min": 0.5813809633255005, "sampling/sampling_logp_difference/max": 0.5423489809036255, "sampling/sampling_logp_difference/mean": 0.010410616174340248, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4537.0, "completions/max_terminated_length": 4537.0, "completions/mean_length": 2718.291748046875, "completions/mean_terminated_length": 2718.291748046875, "completions/min_length": 1239.0, "completions/min_terminated_length": 1239.0, "entropy": 0.3914121612906456, "epoch": 0.14587332053742802, "frac_reward_zero_std": 0.0, "grad_norm": 0.12524679255799936, "kl": 0.002130853448761627, "learning_rate": 9.488522882909957e-07, "loss": -0.1127, "num_tokens": 19282300.0, "reward": 0.2916666865348816, "reward_std": 0.48371022939682007, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.8936865329742432, "sampling/importance_sampling_ratio/mean": 1.0001052618026733, "sampling/importance_sampling_ratio/min": 0.49595868587493896, "sampling/sampling_logp_difference/max": 0.7012626528739929, "sampling/sampling_logp_difference/mean": 0.010546887293457985, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6169.0, "completions/mean_length": 3058.95849609375, "completions/mean_terminated_length": 2835.78271484375, "completions/min_length": 1170.0, "completions/min_terminated_length": 1170.0, "entropy": 0.35029567778110504, "epoch": 0.14651311580294304, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07918957297323162, "kl": 0.0018175425357185304, "learning_rate": 9.484085862998683e-07, "loss": 0.1354, "num_tokens": 19374987.0, "reward": 1.0833333730697632, "reward_std": 0.41387641429901123, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999936044216156, "sampling/importance_sampling_ratio/min": 0.28725138306617737, "sampling/sampling_logp_difference/max": 1.2473976612091064, "sampling/sampling_logp_difference/mean": 0.009482220746576786, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6415.0, "completions/max_terminated_length": 6415.0, "completions/mean_length": 2382.70849609375, "completions/mean_terminated_length": 2382.70849609375, "completions/min_length": 941.0, "completions/min_terminated_length": 941.0, "entropy": 0.21876050904393196, "epoch": 0.1471529110684581, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0896841912083503, "kl": 0.0015965034544933587, "learning_rate": 9.479630727368926e-07, "loss": 0.0269, "num_tokens": 19446668.0, "reward": 1.75, "reward_std": 0.4232262969017029, "rewards/accuracy_reward/mean": 0.8333333134651184, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5217351913452148, "sampling/importance_sampling_ratio/mean": 0.9998742938041687, "sampling/importance_sampling_ratio/min": 0.45347750186920166, "sampling/sampling_logp_difference/max": 0.7908096313476562, "sampling/sampling_logp_difference/mean": 0.006710171699523926, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5442.0, "completions/max_terminated_length": 5442.0, "completions/mean_length": 2953.166748046875, "completions/mean_terminated_length": 2953.166748046875, "completions/min_length": 1742.0, "completions/min_terminated_length": 1742.0, "entropy": 0.3283969908952713, "epoch": 0.14779270633397312, "frac_reward_zero_std": 0.0, "grad_norm": 0.09099643158421669, "kl": 0.001758955157129094, "learning_rate": 9.475157494019443e-07, "loss": -0.0182, "num_tokens": 19537056.0, "reward": 1.0833333730697632, "reward_std": 0.8443689346313477, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5160913467407227, "sampling/importance_sampling_ratio/mean": 1.0000977516174316, "sampling/importance_sampling_ratio/min": 0.4463011622428894, "sampling/sampling_logp_difference/max": 0.8067613840103149, "sampling/sampling_logp_difference/mean": 0.009525771252810955, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6474.0, "completions/mean_length": 3817.95849609375, "completions/mean_terminated_length": 3627.78271484375, "completions/min_length": 1051.0, "completions/min_terminated_length": 1051.0, "entropy": 0.47098466753959656, "epoch": 0.14843250159948818, "frac_reward_zero_std": 0.0, "grad_norm": 0.10437818054325577, "kl": 0.002702219062484801, "learning_rate": 9.470666181022112e-07, "loss": 0.0105, "num_tokens": 19648503.0, "reward": 0.3333333432674408, "reward_std": 0.5260698199272156, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999904632568359, "sampling/importance_sampling_ratio/min": 0.5809345245361328, "sampling/sampling_logp_difference/max": 0.7015767097473145, "sampling/sampling_logp_difference/mean": 0.012309685349464417, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4450.0, "completions/max_terminated_length": 4450.0, "completions/mean_length": 2672.33349609375, "completions/mean_terminated_length": 2672.33349609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.35702457278966904, "epoch": 0.1490722968650032, "frac_reward_zero_std": 0.0, "grad_norm": 0.08209387423004616, "kl": 0.02824240201152861, "learning_rate": 9.466156806521847e-07, "loss": -0.1165, "num_tokens": 19732455.0, "reward": 1.375, "reward_std": 0.8017482757568359, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.7720260620117188, "sampling/importance_sampling_ratio/mean": 1.000019907951355, "sampling/importance_sampling_ratio/min": 0.6241641640663147, "sampling/sampling_logp_difference/max": 0.5721235275268555, "sampling/sampling_logp_difference/mean": 0.008783277124166489, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7288.0, "completions/mean_length": 4192.25, "completions/mean_terminated_length": 3620.857177734375, "completions/min_length": 1756.0, "completions/min_terminated_length": 1756.0, "entropy": 0.39418409019708633, "epoch": 0.14971209213051823, "frac_reward_zero_std": 0.0, "grad_norm": 0.09045401088427076, "kl": 0.0014344527735374868, "learning_rate": 9.461629388736531e-07, "loss": 0.0571, "num_tokens": 19844269.0, "reward": 0.875, "reward_std": 0.7532514333724976, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.8406940698623657, "sampling/importance_sampling_ratio/mean": 1.0001128911972046, "sampling/importance_sampling_ratio/min": 0.47254857420921326, "sampling/sampling_logp_difference/max": 0.7496147155761719, "sampling/sampling_logp_difference/mean": 0.01093992218375206, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2774.0, "completions/max_terminated_length": 2774.0, "completions/mean_length": 1472.666748046875, "completions/mean_terminated_length": 1472.666748046875, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "entropy": 0.2226812243461609, "epoch": 0.15035188739603328, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09769918132317344, "kl": 0.0017997178074438125, "learning_rate": 9.457083945956947e-07, "loss": -0.0291, "num_tokens": 19895381.0, "reward": 1.125, "reward_std": 0.41331955790519714, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.3118239641189575, "sampling/importance_sampling_ratio/mean": 0.9999702572822571, "sampling/importance_sampling_ratio/min": 0.4718208909034729, "sampling/sampling_logp_difference/max": 0.7511558532714844, "sampling/sampling_logp_difference/mean": 0.006577888038009405, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5945.0, "completions/mean_length": 2883.70849609375, "completions/mean_terminated_length": 2652.9130859375, "completions/min_length": 560.0, "completions/min_terminated_length": 560.0, "entropy": 0.26791413128376007, "epoch": 0.1509916826615483, "frac_reward_zero_std": 0.0, "grad_norm": 0.09024327744215913, "kl": 0.001623491378268227, "learning_rate": 9.452520496546692e-07, "loss": 0.0719, "num_tokens": 19993566.0, "reward": 0.4583333432674408, "reward_std": 0.6504079103469849, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.8209582567214966, "sampling/importance_sampling_ratio/mean": 0.9997727870941162, "sampling/importance_sampling_ratio/min": 0.5257617235183716, "sampling/sampling_logp_difference/max": 0.6429071426391602, "sampling/sampling_logp_difference/mean": 0.008447925560176373, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 6461.0, "completions/mean_length": 4682.5419921875, "completions/mean_terminated_length": 3980.650146484375, "completions/min_length": 1678.0, "completions/min_terminated_length": 1678.0, "entropy": 0.5153336301445961, "epoch": 0.15163147792706333, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07389608846172643, "kl": 0.001604592485819012, "learning_rate": 9.447939058942117e-07, "loss": -0.0528, "num_tokens": 20116275.0, "reward": 0.0833333358168602, "reward_std": 0.2357022613286972, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.0416666679084301, "rewards/code_reward/std": 0.20412413775920868, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000050067901611, "sampling/importance_sampling_ratio/min": 0.512904942035675, "sampling/sampling_logp_difference/max": 0.8865945339202881, "sampling/sampling_logp_difference/mean": 0.012914205901324749, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7846.0, "completions/mean_length": 4082.375, "completions/mean_terminated_length": 3903.69580078125, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "entropy": 0.31475451588630676, "epoch": 0.15227127319257838, "frac_reward_zero_std": 0.0, "grad_norm": 0.07957617848643939, "kl": 0.0013213242636993527, "learning_rate": 9.44333965165224e-07, "loss": 0.0181, "num_tokens": 20242380.0, "reward": 0.5416666865348816, "reward_std": 0.6490218043327332, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000362396240234, "sampling/importance_sampling_ratio/min": 0.4415496289730072, "sampling/sampling_logp_difference/max": 0.8798177242279053, "sampling/sampling_logp_difference/mean": 0.00889237318187952, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5033.0, "completions/max_terminated_length": 5033.0, "completions/mean_length": 2492.0, "completions/mean_terminated_length": 2492.0, "completions/min_length": 1422.0, "completions/min_terminated_length": 1422.0, "entropy": 0.3173511251807213, "epoch": 0.1529110684580934, "frac_reward_zero_std": 0.0, "grad_norm": 0.11308269657883199, "kl": 0.0013647068408317864, "learning_rate": 9.438722293258677e-07, "loss": -0.0921, "num_tokens": 20323308.0, "reward": 1.125, "reward_std": 0.7288650274276733, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999485015869141, "sampling/importance_sampling_ratio/min": 0.28462639451026917, "sampling/sampling_logp_difference/max": 1.2565778493881226, "sampling/sampling_logp_difference/mean": 0.009004051797091961, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5797.0, "completions/max_terminated_length": 5797.0, "completions/mean_length": 2700.875, "completions/mean_terminated_length": 2700.875, "completions/min_length": 1056.0, "completions/min_terminated_length": 1056.0, "entropy": 0.44053562730550766, "epoch": 0.15355086372360843, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07297916505883946, "kl": 0.0033262537326663733, "learning_rate": 9.434087002415569e-07, "loss": 0.0599, "num_tokens": 20397721.0, "reward": 1.0, "reward_std": 0.2519763112068176, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.5048885345458984, "sampling/importance_sampling_ratio/mean": 1.0003113746643066, "sampling/importance_sampling_ratio/min": 0.41277748346328735, "sampling/sampling_logp_difference/max": 0.8848466873168945, "sampling/sampling_logp_difference/mean": 0.0123211070895195, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5968.0, "completions/mean_length": 3084.125, "completions/mean_terminated_length": 2862.04345703125, "completions/min_length": 1544.0, "completions/min_terminated_length": 1544.0, "entropy": 0.2786095477640629, "epoch": 0.15419065898912349, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07820913955623726, "kl": 0.001483403641032055, "learning_rate": 9.429433797849506e-07, "loss": 0.0438, "num_tokens": 20496316.0, "reward": 0.9166666865348816, "reward_std": 0.33247750997543335, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999173283576965, "sampling/importance_sampling_ratio/min": 0.4342159926891327, "sampling/sampling_logp_difference/max": 0.8649272918701172, "sampling/sampling_logp_difference/mean": 0.00770160648971796, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 6735.0, "completions/mean_length": 4241.95849609375, "completions/mean_terminated_length": 3677.666748046875, "completions/min_length": 2272.0, "completions/min_terminated_length": 2272.0, "entropy": 0.3394963666796684, "epoch": 0.1548304542546385, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05562838910159629, "kl": 0.0016246733139269054, "learning_rate": 9.424762698359441e-07, "loss": -0.0114, "num_tokens": 20625427.0, "reward": 0.5416666865348816, "reward_std": 0.4596785008907318, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.5761867761611938, "sampling/importance_sampling_ratio/mean": 0.9999747276306152, "sampling/importance_sampling_ratio/min": 0.15403074026107788, "sampling/sampling_logp_difference/max": 1.870603084564209, "sampling/sampling_logp_difference/mean": 0.009600838646292686, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 7026.0, "completions/mean_length": 4893.70849609375, "completions/mean_terminated_length": 4234.05029296875, "completions/min_length": 2047.0, "completions/min_terminated_length": 2047.0, "entropy": 0.44802406430244446, "epoch": 0.15547024952015356, "frac_reward_zero_std": 0.0, "grad_norm": 0.08368689927627088, "kl": 0.0014865454286336899, "learning_rate": 9.420073722816632e-07, "loss": 0.1102, "num_tokens": 20757892.0, "reward": 0.5833333730697632, "reward_std": 0.794804573059082, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000636577606201, "sampling/importance_sampling_ratio/min": 0.3995599150657654, "sampling/sampling_logp_difference/max": 0.9173915386199951, "sampling/sampling_logp_difference/mean": 0.012463006190955639, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6855.0, "completions/mean_length": 3061.33349609375, "completions/mean_terminated_length": 2838.260986328125, "completions/min_length": 6.0, "completions/min_terminated_length": 6.0, "entropy": 0.33698368445038795, "epoch": 0.1561100447856686, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11835406201860971, "kl": 0.006053567340131849, "learning_rate": 9.415366890164551e-07, "loss": 0.0966, "num_tokens": 20847228.0, "reward": 1.25, "reward_std": 0.40627965331077576, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.638437271118164, "sampling/importance_sampling_ratio/mean": 1.000024676322937, "sampling/importance_sampling_ratio/min": 0.46697524189949036, "sampling/sampling_logp_difference/max": 0.7614790201187134, "sampling/sampling_logp_difference/mean": 0.00840558111667633, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4060.0, "completions/mean_length": 2014.791748046875, "completions/mean_terminated_length": 1746.2174072265625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "entropy": 0.6057399064302444, "epoch": 0.15674984005118361, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.045873256229531104, "kl": 0.0017980406119022518, "learning_rate": 9.410642219418814e-07, "loss": -0.0192, "num_tokens": 20927527.0, "reward": 0.625, "reward_std": 0.1178511306643486, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.814685344696045, "sampling/importance_sampling_ratio/mean": 0.9999256134033203, "sampling/importance_sampling_ratio/min": 0.6963849067687988, "sampling/sampling_logp_difference/max": 0.5959120988845825, "sampling/sampling_logp_difference/mean": 0.009461983107030392, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7404.0, "completions/mean_length": 4489.5419921875, "completions/mean_terminated_length": 4152.95458984375, "completions/min_length": 507.0, "completions/min_terminated_length": 507.0, "entropy": 0.4621380865573883, "epoch": 0.15738963531669867, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06358070352610046, "kl": 0.001682152651483193, "learning_rate": 9.405899729667103e-07, "loss": 0.054, "num_tokens": 21050500.0, "reward": 0.5, "reward_std": 0.4714045226573944, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.6987584829330444, "sampling/importance_sampling_ratio/mean": 1.0001720190048218, "sampling/importance_sampling_ratio/min": 0.5325356721878052, "sampling/sampling_logp_difference/max": 0.6301054954528809, "sampling/sampling_logp_difference/mean": 0.012518037110567093, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7616.0, "completions/mean_length": 3496.5, "completions/mean_terminated_length": 3069.636474609375, "completions/min_length": 996.0, "completions/min_terminated_length": 996.0, "entropy": 0.3084065206348896, "epoch": 0.1580294305822137, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07603121897920348, "kl": 0.0013958429917693138, "learning_rate": 9.401139440069089e-07, "loss": 0.0431, "num_tokens": 21153504.0, "reward": 1.375, "reward_std": 0.48371022939682007, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.823132038116455, "sampling/importance_sampling_ratio/mean": 1.000075340270996, "sampling/importance_sampling_ratio/min": 0.4789339005947113, "sampling/sampling_logp_difference/max": 0.7361927032470703, "sampling/sampling_logp_difference/mean": 0.008643370121717453, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 6368.0, "completions/mean_length": 3921.291748046875, "completions/mean_terminated_length": 3311.1904296875, "completions/min_length": 1329.0, "completions/min_terminated_length": 1329.0, "entropy": 0.41618896275758743, "epoch": 0.15866922584772872, "frac_reward_zero_std": 0.0, "grad_norm": 0.09515586036504452, "kl": 0.0015366743318736553, "learning_rate": 9.396361369856354e-07, "loss": 0.1296, "num_tokens": 21258095.0, "reward": 0.625, "reward_std": 0.6207113265991211, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.766206979751587, "sampling/importance_sampling_ratio/mean": 0.9999729990959167, "sampling/importance_sampling_ratio/min": 0.6304970383644104, "sampling/sampling_logp_difference/max": 0.5688343048095703, "sampling/sampling_logp_difference/mean": 0.011662821285426617, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6695.0, "completions/max_terminated_length": 6695.0, "completions/mean_length": 3278.08349609375, "completions/mean_terminated_length": 3278.08349609375, "completions/min_length": 1381.0, "completions/min_terminated_length": 1381.0, "entropy": 0.3023187890648842, "epoch": 0.15930902111324377, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09010715460562958, "kl": 0.0015112101973500103, "learning_rate": 9.391565538332316e-07, "loss": 0.0985, "num_tokens": 21359761.0, "reward": 0.875, "reward_std": 0.3698274493217468, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.7461363077163696, "sampling/importance_sampling_ratio/mean": 0.9999722838401794, "sampling/importance_sampling_ratio/min": 0.4854552149772644, "sampling/sampling_logp_difference/max": 0.7226682305335999, "sampling/sampling_logp_difference/mean": 0.008920231834053993, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7650.0, "completions/mean_length": 2817.166748046875, "completions/mean_terminated_length": 2583.478271484375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9074975699186325, "epoch": 0.1599488163787588, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1081426270511214, "kl": 0.004638458864064887, "learning_rate": 9.386751964872143e-07, "loss": -0.0503, "num_tokens": 21457461.0, "reward": 0.5416666865348816, "reward_std": 0.3698274493217468, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.3333333432674408, "rewards/code_reward/std": 0.4815433919429779, "sampling/importance_sampling_ratio/max": 1.7777073383331299, "sampling/importance_sampling_ratio/mean": 0.9996886253356934, "sampling/importance_sampling_ratio/min": 0.6141208410263062, "sampling/sampling_logp_difference/max": 0.575324535369873, "sampling/sampling_logp_difference/mean": 0.012901335954666138, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6866.0, "completions/max_terminated_length": 6866.0, "completions/mean_length": 3905.33349609375, "completions/mean_terminated_length": 3905.33349609375, "completions/min_length": 2192.0, "completions/min_terminated_length": 2192.0, "entropy": 0.4588124006986618, "epoch": 0.16058861164427382, "frac_reward_zero_std": 0.0, "grad_norm": 0.09503619350622873, "kl": 0.0017405230028089136, "learning_rate": 9.381920668922687e-07, "loss": 0.0672, "num_tokens": 21564013.0, "reward": 1.2916667461395264, "reward_std": 0.7109179496765137, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.7926965951919556, "sampling/importance_sampling_ratio/mean": 1.0000442266464233, "sampling/importance_sampling_ratio/min": 0.3712010085582733, "sampling/sampling_logp_difference/max": 0.9910116195678711, "sampling/sampling_logp_difference/mean": 0.011961910873651505, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5639.0, "completions/max_terminated_length": 5639.0, "completions/mean_length": 2853.08349609375, "completions/mean_terminated_length": 2853.08349609375, "completions/min_length": 1354.0, "completions/min_terminated_length": 1354.0, "entropy": 0.38031429052352905, "epoch": 0.16122840690978887, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.061922545559730514, "kl": 0.001993089070310816, "learning_rate": 9.377071670002397e-07, "loss": -0.0134, "num_tokens": 21647959.0, "reward": 0.4166666865348816, "reward_std": 0.34503278136253357, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.700966477394104, "sampling/importance_sampling_ratio/mean": 1.0000778436660767, "sampling/importance_sampling_ratio/min": 0.43392372131347656, "sampling/sampling_logp_difference/max": 0.8348865509033203, "sampling/sampling_logp_difference/mean": 0.01082542072981596, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4160.0, "completions/max_terminated_length": 4160.0, "completions/mean_length": 1818.25, "completions/mean_terminated_length": 1818.25, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.4121159724891186, "epoch": 0.1618682021753039, "frac_reward_zero_std": 0.0, "grad_norm": 0.11951053851298536, "kl": 0.0018040471186395735, "learning_rate": 9.372204987701238e-07, "loss": 0.0962, "num_tokens": 21703797.0, "reward": 0.9166666865348816, "reward_std": 0.5443089604377747, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.407352328300476, "sampling/importance_sampling_ratio/mean": 1.0000728368759155, "sampling/importance_sampling_ratio/min": 0.6978182792663574, "sampling/sampling_logp_difference/max": 0.35979652404785156, "sampling/sampling_logp_difference/mean": 0.008998434990644455, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6782.0, "completions/mean_length": 3084.33349609375, "completions/mean_terminated_length": 2862.260986328125, "completions/min_length": 1082.0, "completions/min_terminated_length": 1082.0, "entropy": 0.3282938674092293, "epoch": 0.16250799744081892, "frac_reward_zero_std": 0.0, "grad_norm": 0.10086771174305138, "kl": 0.0015595334116369486, "learning_rate": 9.36732064168062e-07, "loss": 0.0217, "num_tokens": 21806165.0, "reward": 0.8333333730697632, "reward_std": 0.5748276710510254, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.625619888305664, "sampling/importance_sampling_ratio/mean": 1.000005841255188, "sampling/importance_sampling_ratio/min": 0.3335414528846741, "sampling/sampling_logp_difference/max": 1.0979881286621094, "sampling/sampling_logp_difference/mean": 0.00939197838306427, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7365.0, "completions/max_terminated_length": 7365.0, "completions/mean_length": 2840.58349609375, "completions/mean_terminated_length": 2840.58349609375, "completions/min_length": 1010.0, "completions/min_terminated_length": 1010.0, "entropy": 0.38963591307401657, "epoch": 0.16314779270633398, "frac_reward_zero_std": 0.0, "grad_norm": 0.1003280723867196, "kl": 0.0018757306097541004, "learning_rate": 9.362418651673316e-07, "loss": -0.0777, "num_tokens": 21888467.0, "reward": 0.9166666865348816, "reward_std": 0.5443089604377747, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000813007354736, "sampling/importance_sampling_ratio/min": 0.6298624277114868, "sampling/sampling_logp_difference/max": 0.7505922317504883, "sampling/sampling_logp_difference/mean": 0.010346990078687668, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5293.0, "completions/max_terminated_length": 5293.0, "completions/mean_length": 2867.5, "completions/mean_terminated_length": 2867.5, "completions/min_length": 970.0, "completions/min_terminated_length": 970.0, "entropy": 0.30795469880104065, "epoch": 0.163787587971849, "frac_reward_zero_std": 0.0, "grad_norm": 0.09474839743450239, "kl": 0.0016073662554845214, "learning_rate": 9.357499037483376e-07, "loss": -0.0005, "num_tokens": 21974343.0, "reward": 0.3333333432674408, "reward_std": 0.5440332293510437, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.6835378408432007, "sampling/importance_sampling_ratio/mean": 1.0001320838928223, "sampling/importance_sampling_ratio/min": 0.2566823363304138, "sampling/sampling_logp_difference/max": 1.3599159717559814, "sampling/sampling_logp_difference/mean": 0.00863790325820446, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7344.0, "completions/max_terminated_length": 7344.0, "completions/mean_length": 3366.625, "completions/mean_terminated_length": 3366.625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3439633175730705, "epoch": 0.16442738323736406, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05131301061384012, "kl": 0.01485598721774295, "learning_rate": 9.352561818986054e-07, "loss": -0.0099, "num_tokens": 22074550.0, "reward": 0.9583333730697632, "reward_std": 0.5660459995269775, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.6931260824203491, "sampling/importance_sampling_ratio/mean": 1.0000618696212769, "sampling/importance_sampling_ratio/min": 0.6422981023788452, "sampling/sampling_logp_difference/max": 0.5265765190124512, "sampling/sampling_logp_difference/mean": 0.009848318994045258, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2766.0, "completions/max_terminated_length": 2766.0, "completions/mean_length": 1967.541748046875, "completions/mean_terminated_length": 1967.541748046875, "completions/min_length": 1164.0, "completions/min_terminated_length": 1164.0, "entropy": 0.18171357735991478, "epoch": 0.16506717850287908, "frac_reward_zero_std": 0.0, "grad_norm": 0.07788219835338178, "kl": 0.0019956890027970076, "learning_rate": 9.347607016127727e-07, "loss": -0.0219, "num_tokens": 22153219.0, "reward": 0.875, "reward_std": 0.817124605178833, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.9117705821990967, "sampling/importance_sampling_ratio/mean": 1.0002907514572144, "sampling/importance_sampling_ratio/min": 0.6423425078392029, "sampling/sampling_logp_difference/max": 0.6480298042297363, "sampling/sampling_logp_difference/mean": 0.005130046978592873, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5917.0, "completions/max_terminated_length": 5917.0, "completions/mean_length": 3095.416748046875, "completions/mean_terminated_length": 3095.416748046875, "completions/min_length": 1207.0, "completions/min_terminated_length": 1207.0, "entropy": 0.3646044209599495, "epoch": 0.1657069737683941, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09030069919609994, "kl": 0.0016813817201182246, "learning_rate": 9.342634648925811e-07, "loss": -0.0296, "num_tokens": 22239333.0, "reward": 0.75, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000615119934082, "sampling/importance_sampling_ratio/min": 0.5595720410346985, "sampling/sampling_logp_difference/max": 0.8089079856872559, "sampling/sampling_logp_difference/mean": 0.00993109680712223, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8104.0, "completions/max_terminated_length": 8104.0, "completions/mean_length": 4049.416748046875, "completions/mean_terminated_length": 4049.416748046875, "completions/min_length": 1570.0, "completions/min_terminated_length": 1570.0, "entropy": 0.39129963517189026, "epoch": 0.16634676903390916, "frac_reward_zero_std": 0.0, "grad_norm": 0.08968232169317754, "kl": 0.0015355687937699258, "learning_rate": 9.337644737468681e-07, "loss": -0.063, "num_tokens": 22349735.0, "reward": 0.9583333730697632, "reward_std": 0.6542876362800598, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.620063304901123, "sampling/importance_sampling_ratio/mean": 0.9998555183410645, "sampling/importance_sampling_ratio/min": 0.06393441557884216, "sampling/sampling_logp_difference/max": 2.7498974800109863, "sampling/sampling_logp_difference/mean": 0.010814624838531017, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5424.0, "completions/max_terminated_length": 5424.0, "completions/mean_length": 2829.041748046875, "completions/mean_terminated_length": 2829.041748046875, "completions/min_length": 1457.0, "completions/min_terminated_length": 1457.0, "entropy": 0.3033542037010193, "epoch": 0.16698656429942418, "frac_reward_zero_std": 0.0, "grad_norm": 0.08761477618057623, "kl": 0.0015281613450497389, "learning_rate": 9.332637301915592e-07, "loss": -0.0317, "num_tokens": 22432936.0, "reward": 1.4166667461395264, "reward_std": 0.7962853312492371, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5159127712249756, "sampling/importance_sampling_ratio/mean": 0.9999940395355225, "sampling/importance_sampling_ratio/min": 0.38667038083076477, "sampling/sampling_logp_difference/max": 0.9501826763153076, "sampling/sampling_logp_difference/mean": 0.009002018719911575, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7783.0, "completions/mean_length": 3571.45849609375, "completions/mean_terminated_length": 2911.381103515625, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.9952653683722019, "epoch": 0.1676263595649392, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0762873450422337, "kl": 0.003346328332554549, "learning_rate": 9.3276123624966e-07, "loss": 0.1083, "num_tokens": 22570875.0, "reward": 0.6666666865348816, "reward_std": 0.48678088188171387, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.6807352304458618, "sampling/importance_sampling_ratio/mean": 1.0000602006912231, "sampling/importance_sampling_ratio/min": 0.3474818170070648, "sampling/sampling_logp_difference/max": 1.0570428371429443, "sampling/sampling_logp_difference/mean": 0.007691946811974049, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7697.0, "completions/mean_length": 4426.375, "completions/mean_terminated_length": 3888.4287109375, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "entropy": 0.47734443098306656, "epoch": 0.16826615483045426, "frac_reward_zero_std": 0.0, "grad_norm": 0.09273068356245939, "kl": 0.0017781639762688428, "learning_rate": 9.32256993951247e-07, "loss": -0.0455, "num_tokens": 22691268.0, "reward": 0.5416666865348816, "reward_std": 0.6850278377532959, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000131368637085, "sampling/importance_sampling_ratio/min": 0.22672848403453827, "sampling/sampling_logp_difference/max": 1.4840021133422852, "sampling/sampling_logp_difference/mean": 0.013649429194629192, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6590.0, "completions/mean_length": 3259.25, "completions/mean_terminated_length": 3044.78271484375, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "entropy": 0.3535241112112999, "epoch": 0.1689059500959693, "frac_reward_zero_std": 0.0, "grad_norm": 0.09666574463780175, "kl": 0.0015451435465365648, "learning_rate": 9.317510053334603e-07, "loss": 0.1029, "num_tokens": 22784194.0, "reward": 0.875, "reward_std": 0.580761194229126, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.9084163904190063, "sampling/importance_sampling_ratio/mean": 0.9999640583992004, "sampling/importance_sampling_ratio/min": 0.5339730381965637, "sampling/sampling_logp_difference/max": 0.6462737321853638, "sampling/sampling_logp_difference/mean": 0.0097472770139575, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7200.0, "completions/mean_length": 3519.125, "completions/mean_terminated_length": 3315.95654296875, "completions/min_length": 1485.0, "completions/min_terminated_length": 1485.0, "entropy": 0.3762747123837471, "epoch": 0.1695457453614843, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08512531484730837, "kl": 0.002196218090830371, "learning_rate": 9.312432724404956e-07, "loss": 0.088, "num_tokens": 22884877.0, "reward": 0.7083333730697632, "reward_std": 0.4023112952709198, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999319911003113, "sampling/importance_sampling_ratio/min": 0.18065276741981506, "sampling/sampling_logp_difference/max": 1.7111785411834717, "sampling/sampling_logp_difference/mean": 0.0104663772508502, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8122.0, "completions/max_terminated_length": 8122.0, "completions/mean_length": 3446.666748046875, "completions/mean_terminated_length": 3446.666748046875, "completions/min_length": 1142.0, "completions/min_terminated_length": 1142.0, "entropy": 0.36739184707403183, "epoch": 0.17018554062699937, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10407554523925514, "kl": 0.0016269604384433478, "learning_rate": 9.307337973235949e-07, "loss": -0.0643, "num_tokens": 22985357.0, "reward": 0.9166666865348816, "reward_std": 0.651816725730896, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.9785279035568237, "sampling/importance_sampling_ratio/mean": 1.0000628232955933, "sampling/importance_sampling_ratio/min": 0.569318950176239, "sampling/sampling_logp_difference/max": 0.6823530197143555, "sampling/sampling_logp_difference/mean": 0.010417378507554531, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5176.0, "completions/mean_length": 3031.916748046875, "completions/mean_terminated_length": 2807.565185546875, "completions/min_length": 845.0, "completions/min_terminated_length": 845.0, "entropy": 0.2750975862145424, "epoch": 0.1708253358925144, "frac_reward_zero_std": 0.0, "grad_norm": 0.08793087192401537, "kl": 0.0016192662296816707, "learning_rate": 9.30222582041039e-07, "loss": 0.0294, "num_tokens": 23079779.0, "reward": 1.1666667461395264, "reward_std": 0.8869583606719971, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001288652420044, "sampling/importance_sampling_ratio/min": 0.36791980266571045, "sampling/sampling_logp_difference/max": 0.9998903274536133, "sampling/sampling_logp_difference/mean": 0.008248304016888142, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4308.0, "completions/max_terminated_length": 4308.0, "completions/mean_length": 2496.33349609375, "completions/mean_terminated_length": 2496.33349609375, "completions/min_length": 976.0, "completions/min_terminated_length": 976.0, "entropy": 0.29814114421606064, "epoch": 0.17146513115802944, "frac_reward_zero_std": 0.0, "grad_norm": 0.09596556666900243, "kl": 0.00195427707512863, "learning_rate": 9.297096286581388e-07, "loss": -0.0207, "num_tokens": 23158315.0, "reward": 1.0416667461395264, "reward_std": 0.6712342500686646, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4316015243530273, "sampling/importance_sampling_ratio/mean": 1.0000706911087036, "sampling/importance_sampling_ratio/min": 0.5529099702835083, "sampling/sampling_logp_difference/max": 0.5925600528717041, "sampling/sampling_logp_difference/mean": 0.00866561010479927, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6381.0, "completions/mean_length": 4197.45849609375, "completions/mean_terminated_length": 4023.78271484375, "completions/min_length": 1819.0, "completions/min_terminated_length": 1819.0, "entropy": 0.4825332686305046, "epoch": 0.17210492642354447, "frac_reward_zero_std": 0.0, "grad_norm": 0.19686367482331402, "kl": 0.002109659108100459, "learning_rate": 9.291949392472276e-07, "loss": -0.0209, "num_tokens": 23272846.0, "reward": 0.5, "reward_std": 0.5986984968185425, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000069260597229, "sampling/importance_sampling_ratio/min": 0.4380483329296112, "sampling/sampling_logp_difference/max": 0.8254261016845703, "sampling/sampling_logp_difference/mean": 0.012698754668235779, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7716.0, "completions/mean_length": 4083.45849609375, "completions/mean_terminated_length": 3709.95458984375, "completions/min_length": 851.0, "completions/min_terminated_length": 851.0, "entropy": 0.5017390698194504, "epoch": 0.1727447216890595, "frac_reward_zero_std": 0.0, "grad_norm": 0.09164423580043485, "kl": 0.0017619787249714136, "learning_rate": 9.286785158876518e-07, "loss": 0.0744, "num_tokens": 23384289.0, "reward": 0.75, "reward_std": 0.5201624631881714, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5262525081634521, "sampling/importance_sampling_ratio/mean": 1.0000590085983276, "sampling/importance_sampling_ratio/min": 0.5832366943359375, "sampling/sampling_logp_difference/max": 0.5391621589660645, "sampling/sampling_logp_difference/mean": 0.013175684958696365, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4795.0, "completions/mean_length": 3030.20849609375, "completions/mean_terminated_length": 2805.78271484375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3504873365163803, "epoch": 0.17338451695457455, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08740313918206231, "kl": 0.009163184557110071, "learning_rate": 9.281603606657631e-07, "loss": -0.0326, "num_tokens": 23493398.0, "reward": 0.5, "reward_std": 0.6649550199508667, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.7855685949325562, "sampling/importance_sampling_ratio/mean": 1.0002983808517456, "sampling/importance_sampling_ratio/min": 0.5957825779914856, "sampling/sampling_logp_difference/max": 0.5797368288040161, "sampling/sampling_logp_difference/mean": 0.00786285474896431, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7477.0, "completions/max_terminated_length": 7477.0, "completions/mean_length": 3375.33349609375, "completions/mean_terminated_length": 3375.33349609375, "completions/min_length": 1771.0, "completions/min_terminated_length": 1771.0, "entropy": 0.3483826145529747, "epoch": 0.17402431222008957, "frac_reward_zero_std": 0.0, "grad_norm": 0.11330676573427581, "kl": 0.0016217615047935396, "learning_rate": 9.276404756749099e-07, "loss": -0.1511, "num_tokens": 23590446.0, "reward": 1.125, "reward_std": 0.7767796516418457, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.5061575174331665, "sampling/importance_sampling_ratio/mean": 1.0000848770141602, "sampling/importance_sampling_ratio/min": 0.3899589478969574, "sampling/sampling_logp_difference/max": 0.941713809967041, "sampling/sampling_logp_difference/mean": 0.009579382836818695, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 4607.0, "completions/mean_length": 3703.75, "completions/mean_terminated_length": 2806.10009765625, "completions/min_length": 1821.0, "completions/min_terminated_length": 1821.0, "entropy": 0.4072815775871277, "epoch": 0.1746641074856046, "frac_reward_zero_std": 0.0, "grad_norm": 0.11479457602701557, "kl": 0.001906210440210998, "learning_rate": 9.271188630154289e-07, "loss": 0.1051, "num_tokens": 23690960.0, "reward": 0.5833333730697632, "reward_std": 0.6536394953727722, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000778436660767, "sampling/importance_sampling_ratio/min": 0.3418636620044708, "sampling/sampling_logp_difference/max": 1.073343276977539, "sampling/sampling_logp_difference/mean": 0.010775871574878693, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5029.0, "completions/max_terminated_length": 5029.0, "completions/mean_length": 2324.791748046875, "completions/mean_terminated_length": 2324.791748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.37168189138174057, "epoch": 0.17530390275111965, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.082985926816508, "kl": 0.007366295263636857, "learning_rate": 9.265955247946368e-07, "loss": 0.0274, "num_tokens": 23768427.0, "reward": 0.2083333432674408, "reward_std": 0.29602527618408203, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.2083333283662796, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.8643770217895508, "sampling/importance_sampling_ratio/mean": 1.000276803970337, "sampling/importance_sampling_ratio/min": 0.07978688925504684, "sampling/sampling_logp_difference/max": 2.5283961296081543, "sampling/sampling_logp_difference/mean": 0.006768062245100737, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5394.0, "completions/max_terminated_length": 5394.0, "completions/mean_length": 2038.416748046875, "completions/mean_terminated_length": 2038.416748046875, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "entropy": 0.3426049128174782, "epoch": 0.17594369801663468, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07546958928463682, "kl": 0.0017090850742533803, "learning_rate": 9.260704631268211e-07, "loss": -0.1548, "num_tokens": 23838061.0, "reward": 0.7916666865348816, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000088095664978, "sampling/importance_sampling_ratio/min": 0.2981249690055847, "sampling/sampling_logp_difference/max": 1.210242509841919, "sampling/sampling_logp_difference/mean": 0.009223436005413532, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4985.0, "completions/max_terminated_length": 4985.0, "completions/mean_length": 2008.041748046875, "completions/mean_terminated_length": 2008.041748046875, "completions/min_length": 871.0, "completions/min_terminated_length": 871.0, "entropy": 0.27396035566926, "epoch": 0.1765834932821497, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10141768455323319, "kl": 0.001498931465903297, "learning_rate": 9.255436801332324e-07, "loss": 0.0806, "num_tokens": 23914358.0, "reward": 1.4166667461395264, "reward_std": 0.5443090200424194, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.5629180669784546, "sampling/importance_sampling_ratio/mean": 1.0000289678573608, "sampling/importance_sampling_ratio/min": 0.6719554662704468, "sampling/sampling_logp_difference/max": 0.44655466079711914, "sampling/sampling_logp_difference/mean": 0.00810764729976654, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6210.0, "completions/mean_length": 3159.375, "completions/mean_terminated_length": 2940.565185546875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.23083499819040298, "epoch": 0.17722328854766475, "frac_reward_zero_std": 0.0, "grad_norm": 0.08292101434276983, "kl": 0.029180564946727827, "learning_rate": 9.250151779420754e-07, "loss": 0.0965, "num_tokens": 24009047.0, "reward": 1.3333333730697632, "reward_std": 0.93708336353302, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001401901245117, "sampling/importance_sampling_ratio/min": 0.5209546685218811, "sampling/sampling_logp_difference/max": 1.500412940979004, "sampling/sampling_logp_difference/mean": 0.006528790108859539, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4175.0, "completions/max_terminated_length": 4175.0, "completions/mean_length": 2665.416748046875, "completions/mean_terminated_length": 2665.416748046875, "completions/min_length": 1286.0, "completions/min_terminated_length": 1286.0, "entropy": 0.323912613093853, "epoch": 0.17786308381317978, "frac_reward_zero_std": 0.0, "grad_norm": 0.09370610369502397, "kl": 0.0022266547312028706, "learning_rate": 9.244849586885003e-07, "loss": 0.0079, "num_tokens": 24094297.0, "reward": 1.5416667461395264, "reward_std": 0.6618844270706177, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5894885063171387, "sampling/importance_sampling_ratio/mean": 0.9998388886451721, "sampling/importance_sampling_ratio/min": 0.5059271454811096, "sampling/sampling_logp_difference/max": 0.6813626289367676, "sampling/sampling_logp_difference/mean": 0.00910603441298008, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7713.0, "completions/mean_length": 3415.5, "completions/mean_terminated_length": 3207.826171875, "completions/min_length": 1060.0, "completions/min_terminated_length": 1060.0, "entropy": 0.3603703901171684, "epoch": 0.1785028790786948, "frac_reward_zero_std": 0.0, "grad_norm": 0.0884925109589232, "kl": 0.0023414807219523937, "learning_rate": 9.239530245145943e-07, "loss": 0.039, "num_tokens": 24193941.0, "reward": 0.5416666865348816, "reward_std": 0.7532514333724976, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6800529956817627, "sampling/importance_sampling_ratio/mean": 1.0001212358474731, "sampling/importance_sampling_ratio/min": 0.48133647441864014, "sampling/sampling_logp_difference/max": 0.7311887741088867, "sampling/sampling_logp_difference/mean": 0.010349467396736145, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4951.0, "completions/max_terminated_length": 4951.0, "completions/mean_length": 2463.75, "completions/mean_terminated_length": 2463.75, "completions/min_length": 1375.0, "completions/min_terminated_length": 1375.0, "entropy": 0.2823691815137863, "epoch": 0.17914267434420986, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06812216590475945, "kl": 0.0015676407783757895, "learning_rate": 9.234193775693726e-07, "loss": 0.0691, "num_tokens": 24273831.0, "reward": 1.0416667461395264, "reward_std": 0.2721545100212097, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.6299070119857788, "sampling/importance_sampling_ratio/mean": 0.9999343752861023, "sampling/importance_sampling_ratio/min": 0.6105230450630188, "sampling/sampling_logp_difference/max": 0.4934391975402832, "sampling/sampling_logp_difference/mean": 0.007973939180374146, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6135.0, "completions/max_terminated_length": 6135.0, "completions/mean_length": 2737.58349609375, "completions/mean_terminated_length": 2737.58349609375, "completions/min_length": 699.0, "completions/min_terminated_length": 699.0, "entropy": 0.3994944840669632, "epoch": 0.17978246960972488, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08261198842461721, "kl": 0.001578761264681816, "learning_rate": 9.228840200087706e-07, "loss": 0.0648, "num_tokens": 24355381.0, "reward": 1.125, "reward_std": 0.42645785212516785, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.981878399848938, "sampling/importance_sampling_ratio/mean": 0.9999663829803467, "sampling/importance_sampling_ratio/min": 0.5048984289169312, "sampling/sampling_logp_difference/max": 0.6840450763702393, "sampling/sampling_logp_difference/mean": 0.009553713724017143, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4128.0, "completions/max_terminated_length": 4128.0, "completions/mean_length": 1943.125, "completions/mean_terminated_length": 1943.125, "completions/min_length": 1130.0, "completions/min_terminated_length": 1130.0, "entropy": 0.323208712041378, "epoch": 0.18042226487523993, "frac_reward_zero_std": 0.0, "grad_norm": 0.10352978863218013, "kl": 0.002190224884543568, "learning_rate": 9.22346953995634e-07, "loss": 0.0374, "num_tokens": 24413992.0, "reward": 1.5833333730697632, "reward_std": 0.6658527255058289, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.8360676765441895, "sampling/importance_sampling_ratio/mean": 0.9998777508735657, "sampling/importance_sampling_ratio/min": 0.5772933959960938, "sampling/sampling_logp_difference/max": 0.60762619972229, "sampling/sampling_logp_difference/mean": 0.009493863210082054, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7520.0, "completions/mean_length": 3123.33349609375, "completions/mean_terminated_length": 2902.95654296875, "completions/min_length": 1066.0, "completions/min_terminated_length": 1066.0, "entropy": 0.3481386974453926, "epoch": 0.18106206014075496, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1005235128307633, "kl": 0.0017576328245922923, "learning_rate": 9.218081816997109e-07, "loss": -0.0175, "num_tokens": 24504824.0, "reward": 1.375, "reward_std": 0.4261821210384369, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.4411492347717285, "sampling/importance_sampling_ratio/mean": 0.9998375773429871, "sampling/importance_sampling_ratio/min": 0.35809460282325745, "sampling/sampling_logp_difference/max": 1.0269581079483032, "sampling/sampling_logp_difference/mean": 0.009513337165117264, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7152.0, "completions/max_terminated_length": 7152.0, "completions/mean_length": 4008.291748046875, "completions/mean_terminated_length": 4008.291748046875, "completions/min_length": 1803.0, "completions/min_terminated_length": 1803.0, "entropy": 0.5512836277484894, "epoch": 0.18170185540626999, "frac_reward_zero_std": 0.0, "grad_norm": 0.09526018820427244, "kl": 0.0021294956968631595, "learning_rate": 9.212677052976427e-07, "loss": -0.0612, "num_tokens": 24612255.0, "reward": 0.4166666865348816, "reward_std": 0.5986984968185425, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000461339950562, "sampling/importance_sampling_ratio/min": 0.18870557844638824, "sampling/sampling_logp_difference/max": 1.667567253112793, "sampling/sampling_logp_difference/mean": 0.014249238185584545, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6043.0, "completions/max_terminated_length": 6043.0, "completions/mean_length": 3793.95849609375, "completions/mean_terminated_length": 3793.95849609375, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "entropy": 0.4924745559692383, "epoch": 0.18234165067178504, "frac_reward_zero_std": 0.0, "grad_norm": 0.09050609279422052, "kl": 0.001882044511148706, "learning_rate": 9.207255269729558e-07, "loss": 0.0413, "num_tokens": 24716390.0, "reward": 0.8333333730697632, "reward_std": 0.7547006607055664, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.4557065963745117, "sampling/importance_sampling_ratio/mean": 0.999973475933075, "sampling/importance_sampling_ratio/min": 0.34705108404159546, "sampling/sampling_logp_difference/max": 1.0582833290100098, "sampling/sampling_logp_difference/mean": 0.012755285017192364, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5550.0, "completions/max_terminated_length": 5550.0, "completions/mean_length": 2103.416748046875, "completions/mean_terminated_length": 2103.416748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3949772045016289, "epoch": 0.18298144593730006, "frac_reward_zero_std": 0.0, "grad_norm": 0.10201957493648214, "kl": 0.06088807716150768, "learning_rate": 9.201816489160516e-07, "loss": -0.0396, "num_tokens": 24785056.0, "reward": 0.75, "reward_std": 0.7452402114868164, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.637352466583252, "sampling/importance_sampling_ratio/mean": 1.0001139640808105, "sampling/importance_sampling_ratio/min": 0.10548253357410431, "sampling/sampling_logp_difference/max": 2.2492098808288574, "sampling/sampling_logp_difference/mean": 0.011083832010626793, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6571.0, "completions/max_terminated_length": 6571.0, "completions/mean_length": 2904.625, "completions/mean_terminated_length": 2904.625, "completions/min_length": 1698.0, "completions/min_terminated_length": 1698.0, "entropy": 0.3259884789586067, "epoch": 0.1836212412028151, "frac_reward_zero_std": 0.0, "grad_norm": 0.08666110706615753, "kl": 0.0017038314836099744, "learning_rate": 9.196360733241992e-07, "loss": 0.0995, "num_tokens": 24872711.0, "reward": 0.7083333730697632, "reward_std": 0.7286013960838318, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.5702507495880127, "sampling/importance_sampling_ratio/mean": 0.9999687671661377, "sampling/importance_sampling_ratio/min": 0.4888106882572174, "sampling/sampling_logp_difference/max": 0.7157800197601318, "sampling/sampling_logp_difference/mean": 0.00903867557644844, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5596.0, "completions/max_terminated_length": 5596.0, "completions/mean_length": 2802.416748046875, "completions/mean_terminated_length": 2802.416748046875, "completions/min_length": 5.0, "completions/min_terminated_length": 5.0, "entropy": 0.4371255859732628, "epoch": 0.18426103646833014, "frac_reward_zero_std": 0.0, "grad_norm": 0.10424577107407852, "kl": 0.01137325627496466, "learning_rate": 9.190888024015251e-07, "loss": -0.0443, "num_tokens": 24964049.0, "reward": 0.75, "reward_std": 0.7957234382629395, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.616662621498108, "sampling/importance_sampling_ratio/mean": 0.9999797344207764, "sampling/importance_sampling_ratio/min": 0.5580862760543823, "sampling/sampling_logp_difference/max": 0.5832417011260986, "sampling/sampling_logp_difference/mean": 0.010753829032182693, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5195.0, "completions/max_terminated_length": 5195.0, "completions/mean_length": 2092.08349609375, "completions/mean_terminated_length": 2092.08349609375, "completions/min_length": 1130.0, "completions/min_terminated_length": 1130.0, "entropy": 0.3421320468187332, "epoch": 0.18490083173384517, "frac_reward_zero_std": 0.0, "grad_norm": 0.10922925974534506, "kl": 0.0024943555472418666, "learning_rate": 9.185398383590055e-07, "loss": -0.0353, "num_tokens": 25024955.0, "reward": 1.2083333730697632, "reward_std": 0.7194125056266785, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.2830907106399536, "sampling/importance_sampling_ratio/mean": 1.0000594854354858, "sampling/importance_sampling_ratio/min": 0.6629406213760376, "sampling/sampling_logp_difference/max": 0.4110698699951172, "sampling/sampling_logp_difference/mean": 0.009234396740794182, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6685.0, "completions/max_terminated_length": 6685.0, "completions/mean_length": 2237.45849609375, "completions/mean_terminated_length": 2237.45849609375, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "entropy": 0.3044307604432106, "epoch": 0.1855406269993602, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08998927848364627, "kl": 0.0018030825885944068, "learning_rate": 9.179891834144564e-07, "loss": -0.0669, "num_tokens": 25095502.0, "reward": 1.1666667461395264, "reward_std": 0.2903675436973572, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.4447531700134277, "sampling/importance_sampling_ratio/mean": 0.9999199509620667, "sampling/importance_sampling_ratio/min": 0.5197436213493347, "sampling/sampling_logp_difference/max": 0.6544196605682373, "sampling/sampling_logp_difference/mean": 0.008468786254525185, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7248.0, "completions/max_terminated_length": 7248.0, "completions/mean_length": 4111.20849609375, "completions/mean_terminated_length": 4111.20849609375, "completions/min_length": 1593.0, "completions/min_terminated_length": 1593.0, "entropy": 0.42200690507888794, "epoch": 0.18618042226487524, "frac_reward_zero_std": 0.0, "grad_norm": 0.08686421813893427, "kl": 0.0021412461646832526, "learning_rate": 9.174368397925253e-07, "loss": 0.0344, "num_tokens": 25208619.0, "reward": 1.2083333730697632, "reward_std": 0.8450170755386353, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000009536743164, "sampling/importance_sampling_ratio/min": 0.4804781377315521, "sampling/sampling_logp_difference/max": 1.5413613319396973, "sampling/sampling_logp_difference/mean": 0.011665033176541328, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7839.0, "completions/max_terminated_length": 7839.0, "completions/mean_length": 3502.166748046875, "completions/mean_terminated_length": 3502.166748046875, "completions/min_length": 1633.0, "completions/min_terminated_length": 1633.0, "entropy": 0.2721410468220711, "epoch": 0.18682021753039027, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0910152458826347, "kl": 0.0015748137084301561, "learning_rate": 9.168828097246816e-07, "loss": -0.1331, "num_tokens": 25316983.0, "reward": 0.7083333730697632, "reward_std": 0.6139818429946899, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5292894840240479, "sampling/importance_sampling_ratio/mean": 1.0001035928726196, "sampling/importance_sampling_ratio/min": 0.5581154823303223, "sampling/sampling_logp_difference/max": 0.5831894278526306, "sampling/sampling_logp_difference/mean": 0.008290046826004982, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5536.0, "completions/max_terminated_length": 5536.0, "completions/mean_length": 2792.83349609375, "completions/mean_terminated_length": 2792.83349609375, "completions/min_length": 1513.0, "completions/min_terminated_length": 1513.0, "entropy": 0.2696964740753174, "epoch": 0.18746001279590532, "frac_reward_zero_std": 0.0, "grad_norm": 0.0873193966971344, "kl": 0.001794331066776067, "learning_rate": 9.163270954492087e-07, "loss": 0.0651, "num_tokens": 25402459.0, "reward": 0.75, "reward_std": 0.7920509576797485, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000816583633423, "sampling/importance_sampling_ratio/min": 0.1825205236673355, "sampling/sampling_logp_difference/max": 1.700892686843872, "sampling/sampling_logp_difference/mean": 0.00813740398734808, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6283.0, "completions/max_terminated_length": 6283.0, "completions/mean_length": 2701.916748046875, "completions/mean_terminated_length": 2701.916748046875, "completions/min_length": 1000.0, "completions/min_terminated_length": 1000.0, "entropy": 0.4217047840356827, "epoch": 0.18809980806142035, "frac_reward_zero_std": 0.0, "grad_norm": 0.10699122236213775, "kl": 0.002265110146254301, "learning_rate": 9.157696992111934e-07, "loss": 0.0475, "num_tokens": 25483985.0, "reward": 0.9166666865348816, "reward_std": 0.71599280834198, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.614693284034729, "sampling/importance_sampling_ratio/mean": 0.9999308586120605, "sampling/importance_sampling_ratio/min": 0.20942914485931396, "sampling/sampling_logp_difference/max": 1.5633697509765625, "sampling/sampling_logp_difference/mean": 0.011649033054709435, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2841.0, "completions/max_terminated_length": 2841.0, "completions/mean_length": 1958.375, "completions/mean_terminated_length": 1958.375, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 0.33340053260326385, "epoch": 0.18873960332693537, "frac_reward_zero_std": 0.0, "grad_norm": 0.10244671819650701, "kl": 0.003170585259795189, "learning_rate": 9.152106232625179e-07, "loss": -0.02, "num_tokens": 25548498.0, "reward": 0.625, "reward_std": 0.8180223703384399, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.5966644287109375, "sampling/importance_sampling_ratio/mean": 1.0000003576278687, "sampling/importance_sampling_ratio/min": 0.5476321578025818, "sampling/sampling_logp_difference/max": 0.6021513938903809, "sampling/sampling_logp_difference/mean": 0.008745279163122177, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6608.0, "completions/max_terminated_length": 6608.0, "completions/mean_length": 2969.95849609375, "completions/mean_terminated_length": 2969.95849609375, "completions/min_length": 1538.0, "completions/min_terminated_length": 1538.0, "entropy": 0.31061506271362305, "epoch": 0.18937939859245043, "frac_reward_zero_std": 0.0, "grad_norm": 0.09535341721995619, "kl": 0.00211804112768732, "learning_rate": 9.146498698618506e-07, "loss": -0.1515, "num_tokens": 25632897.0, "reward": 1.125, "reward_std": 0.8389447927474976, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.8377368450164795, "sampling/importance_sampling_ratio/mean": 0.9999856948852539, "sampling/importance_sampling_ratio/min": 0.398787260055542, "sampling/sampling_logp_difference/max": 0.9193272590637207, "sampling/sampling_logp_difference/mean": 0.009150621481239796, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7995.0, "completions/mean_length": 3966.166748046875, "completions/mean_terminated_length": 3582.0, "completions/min_length": 1582.0, "completions/min_terminated_length": 1582.0, "entropy": 0.4238632917404175, "epoch": 0.19001919385796545, "frac_reward_zero_std": 0.0, "grad_norm": 0.09154623061577646, "kl": 0.002039245649939403, "learning_rate": 9.140874412746364e-07, "loss": 0.0038, "num_tokens": 25740341.0, "reward": 1.0, "reward_std": 0.8045431971549988, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434515476227, "sampling/importance_sampling_ratio/max": 1.3431034088134766, "sampling/importance_sampling_ratio/mean": 0.9999013543128967, "sampling/importance_sampling_ratio/min": 0.5407057404518127, "sampling/sampling_logp_difference/max": 0.6148800849914551, "sampling/sampling_logp_difference/mean": 0.01102988701313734, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6517.0, "completions/max_terminated_length": 6517.0, "completions/mean_length": 2442.0, "completions/mean_terminated_length": 2442.0, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.3483918309211731, "epoch": 0.19065898912348048, "frac_reward_zero_std": 0.0, "grad_norm": 0.11321921991055327, "kl": 0.002636057586641982, "learning_rate": 9.135233397730888e-07, "loss": 0.0612, "num_tokens": 25812453.0, "reward": 0.7916666865348816, "reward_std": 0.9223943948745728, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.7224868535995483, "sampling/importance_sampling_ratio/mean": 0.9998383522033691, "sampling/importance_sampling_ratio/min": 0.5789079666137695, "sampling/sampling_logp_difference/max": 0.5466117858886719, "sampling/sampling_logp_difference/mean": 0.009579210542142391, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5927.0, "completions/max_terminated_length": 5927.0, "completions/mean_length": 4044.45849609375, "completions/mean_terminated_length": 4044.45849609375, "completions/min_length": 1648.0, "completions/min_terminated_length": 1648.0, "entropy": 0.3369429260492325, "epoch": 0.19129878438899553, "frac_reward_zero_std": 0.0, "grad_norm": 0.08971056681398624, "kl": 0.0018213347648270428, "learning_rate": 9.129575676361788e-07, "loss": -0.0536, "num_tokens": 25930472.0, "reward": 0.6666666865348816, "reward_std": 0.7317181825637817, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000296831130981, "sampling/importance_sampling_ratio/min": 0.028150372207164764, "sampling/sampling_logp_difference/max": 3.570194721221924, "sampling/sampling_logp_difference/mean": 0.009325137361884117, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6926.0, "completions/max_terminated_length": 6926.0, "completions/mean_length": 3140.75, "completions/mean_terminated_length": 3140.75, "completions/min_length": 1200.0, "completions/min_terminated_length": 1200.0, "entropy": 0.28468798473477364, "epoch": 0.19193857965451055, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08011464568527085, "kl": 0.0013828831142745912, "learning_rate": 9.123901271496274e-07, "loss": 0.0679, "num_tokens": 26027186.0, "reward": 1.125, "reward_std": 0.5410773754119873, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5716863870620728, "sampling/importance_sampling_ratio/mean": 1.0000649690628052, "sampling/importance_sampling_ratio/min": 0.4258745312690735, "sampling/sampling_logp_difference/max": 0.8536105155944824, "sampling/sampling_logp_difference/mean": 0.00829140655696392, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6029.0, "completions/mean_length": 3193.541748046875, "completions/mean_terminated_length": 2976.217529296875, "completions/min_length": 1076.0, "completions/min_terminated_length": 1076.0, "entropy": 0.27498117834329605, "epoch": 0.19257837492002558, "frac_reward_zero_std": 0.0, "grad_norm": 0.08360385774767459, "kl": 0.0018004390003625304, "learning_rate": 9.11821020605896e-07, "loss": -0.0054, "num_tokens": 26127247.0, "reward": 1.0, "reward_std": 0.8619898557662964, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.4978514909744263, "sampling/importance_sampling_ratio/mean": 1.0001717805862427, "sampling/importance_sampling_ratio/min": 0.5273569822311401, "sampling/sampling_logp_difference/max": 0.6398775577545166, "sampling/sampling_logp_difference/mean": 0.008571052923798561, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4889.0, "completions/max_terminated_length": 4889.0, "completions/mean_length": 2331.70849609375, "completions/mean_terminated_length": 2331.70849609375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.5231466144323349, "epoch": 0.19321817018554063, "frac_reward_zero_std": 0.0, "grad_norm": 0.11303849774076581, "kl": 0.0050256197864655405, "learning_rate": 9.112502503041762e-07, "loss": -0.1282, "num_tokens": 26199856.0, "reward": 0.875, "reward_std": 0.5078567266464233, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.921257734298706, "sampling/importance_sampling_ratio/mean": 1.000076174736023, "sampling/importance_sampling_ratio/min": 0.525313138961792, "sampling/sampling_logp_difference/max": 0.6529800891876221, "sampling/sampling_logp_difference/mean": 0.00927399005740881, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7721.0, "completions/max_terminated_length": 7721.0, "completions/mean_length": 3610.75, "completions/mean_terminated_length": 3610.75, "completions/min_length": 1045.0, "completions/min_terminated_length": 1045.0, "entropy": 0.4001840204000473, "epoch": 0.19385796545105566, "frac_reward_zero_std": 0.0, "grad_norm": 0.08706586321545612, "kl": 0.0016559135110583156, "learning_rate": 9.106778185503815e-07, "loss": -0.0582, "num_tokens": 26303522.0, "reward": 1.5833333730697632, "reward_std": 0.5970091223716736, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.5786644220352173, "sampling/importance_sampling_ratio/mean": 1.0000439882278442, "sampling/importance_sampling_ratio/min": 0.5979683995246887, "sampling/sampling_logp_difference/max": 0.5142173767089844, "sampling/sampling_logp_difference/mean": 0.01082107238471508, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4631.0, "completions/max_terminated_length": 4631.0, "completions/mean_length": 1869.166748046875, "completions/mean_terminated_length": 1869.166748046875, "completions/min_length": 692.0, "completions/min_terminated_length": 692.0, "entropy": 0.29481300711631775, "epoch": 0.1944977607165707, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.06103955449528558, "kl": 0.002433245477732271, "learning_rate": 9.101037276571377e-07, "loss": 0.0379, "num_tokens": 26357910.0, "reward": 1.2916667461395264, "reward_std": 0.1178511306643486, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4672417640686035, "sampling/importance_sampling_ratio/mean": 0.9999199509620667, "sampling/importance_sampling_ratio/min": 0.6450937986373901, "sampling/sampling_logp_difference/max": 0.4383596181869507, "sampling/sampling_logp_difference/mean": 0.008568662218749523, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8170.0, "completions/max_terminated_length": 8170.0, "completions/mean_length": 3446.625, "completions/mean_terminated_length": 3446.625, "completions/min_length": 869.0, "completions/min_terminated_length": 869.0, "entropy": 0.46368713676929474, "epoch": 0.19513755598208574, "frac_reward_zero_std": 0.0, "grad_norm": 0.10225811796974307, "kl": 0.002148979197954759, "learning_rate": 9.095279799437736e-07, "loss": 0.0, "num_tokens": 26456029.0, "reward": 0.6666666865348816, "reward_std": 0.6744658350944519, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.4607020616531372, "sampling/importance_sampling_ratio/mean": 1.000153660774231, "sampling/importance_sampling_ratio/min": 0.6046674847602844, "sampling/sampling_logp_difference/max": 0.5030765533447266, "sampling/sampling_logp_difference/mean": 0.012516054324805737, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6056.0, "completions/max_terminated_length": 6056.0, "completions/mean_length": 2615.625, "completions/mean_terminated_length": 2615.625, "completions/min_length": 481.0, "completions/min_terminated_length": 481.0, "entropy": 0.413513146340847, "epoch": 0.19577735124760076, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08188569160392713, "kl": 0.0019956928153987974, "learning_rate": 9.089505777363112e-07, "loss": -0.0235, "num_tokens": 26536780.0, "reward": 1.375, "reward_std": 0.6232070922851562, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.7238205671310425, "sampling/importance_sampling_ratio/mean": 1.000032663345337, "sampling/importance_sampling_ratio/min": 0.5398373007774353, "sampling/sampling_logp_difference/max": 0.6164875030517578, "sampling/sampling_logp_difference/mean": 0.011240921914577484, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7066.0, "completions/max_terminated_length": 7066.0, "completions/mean_length": 2429.125, "completions/mean_terminated_length": 2429.125, "completions/min_length": 1073.0, "completions/min_terminated_length": 1073.0, "entropy": 0.34617411345243454, "epoch": 0.19641714651311581, "frac_reward_zero_std": 0.0, "grad_norm": 0.11143581834216401, "kl": 0.002115447452524677, "learning_rate": 9.083715233674572e-07, "loss": -0.0085, "num_tokens": 26610807.0, "reward": 0.6666666865348816, "reward_std": 0.8079428672790527, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.5146020650863647, "sampling/importance_sampling_ratio/mean": 1.00005042552948, "sampling/importance_sampling_ratio/min": 0.4257308840751648, "sampling/sampling_logp_difference/max": 0.8539478778839111, "sampling/sampling_logp_difference/mean": 0.009547833353281021, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6425.0, "completions/max_terminated_length": 6425.0, "completions/mean_length": 3338.375, "completions/mean_terminated_length": 3338.375, "completions/min_length": 1653.0, "completions/min_terminated_length": 1653.0, "entropy": 0.34986547380685806, "epoch": 0.19705694177863084, "frac_reward_zero_std": 0.0, "grad_norm": 0.08650814958396663, "kl": 0.0017952530761249363, "learning_rate": 9.077908191765924e-07, "loss": -0.0541, "num_tokens": 26709008.0, "reward": 1.4583333730697632, "reward_std": 0.5461008548736572, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000969171524048, "sampling/importance_sampling_ratio/min": 0.23591791093349457, "sampling/sampling_logp_difference/max": 1.444271445274353, "sampling/sampling_logp_difference/mean": 0.009761277586221695, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6048.0, "completions/max_terminated_length": 6048.0, "completions/mean_length": 3186.45849609375, "completions/mean_terminated_length": 3186.45849609375, "completions/min_length": 1012.0, "completions/min_terminated_length": 1012.0, "entropy": 0.35788194090127945, "epoch": 0.19769673704414586, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09384437632801519, "kl": 0.0019536092004273087, "learning_rate": 9.072084675097638e-07, "loss": -0.045, "num_tokens": 26806187.0, "reward": 1.0416667461395264, "reward_std": 0.4493255615234375, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.8381513357162476, "sampling/importance_sampling_ratio/mean": 0.9999485611915588, "sampling/importance_sampling_ratio/min": 0.19940397143363953, "sampling/sampling_logp_difference/max": 1.6124224662780762, "sampling/sampling_logp_difference/mean": 0.009901905432343483, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6370.0, "completions/mean_length": 3363.125, "completions/mean_terminated_length": 3153.174072265625, "completions/min_length": 1006.0, "completions/min_terminated_length": 1006.0, "entropy": 0.45020394027233124, "epoch": 0.19833653230966092, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09667237964548699, "kl": 0.0024682992370799184, "learning_rate": 9.066244707196729e-07, "loss": 0.0283, "num_tokens": 26910126.0, "reward": 0.9166666865348816, "reward_std": 0.39602547883987427, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000619888305664, "sampling/importance_sampling_ratio/min": 0.5642836689949036, "sampling/sampling_logp_difference/max": 0.7640175819396973, "sampling/sampling_logp_difference/mean": 0.01214392390102148, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5770.0, "completions/max_terminated_length": 5770.0, "completions/mean_length": 2685.20849609375, "completions/mean_terminated_length": 2685.20849609375, "completions/min_length": 1571.0, "completions/min_terminated_length": 1571.0, "entropy": 0.3238915205001831, "epoch": 0.19897632757517594, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07591214495046299, "kl": 0.0015920368023216724, "learning_rate": 9.060388311656688e-07, "loss": 0.0416, "num_tokens": 26998259.0, "reward": 0.6666666865348816, "reward_std": 0.48678088188171387, "rewards/accuracy_reward/mean": 0.0833333358168602, "rewards/accuracy_reward/std": 0.28232985734939575, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.6174613237380981, "sampling/importance_sampling_ratio/mean": 1.0001212358474731, "sampling/importance_sampling_ratio/min": 0.4651406705379486, "sampling/sampling_logp_difference/max": 0.7654154300689697, "sampling/sampling_logp_difference/mean": 0.009209757670760155, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6445.0, "completions/max_terminated_length": 6445.0, "completions/mean_length": 2749.041748046875, "completions/mean_terminated_length": 2749.041748046875, "completions/min_length": 894.0, "completions/min_terminated_length": 894.0, "entropy": 0.3306910917162895, "epoch": 0.19961612284069097, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0899851765122722, "kl": 0.0019427434890531003, "learning_rate": 9.054515512137367e-07, "loss": -0.0943, "num_tokens": 27074476.0, "reward": 1.2083333730697632, "reward_std": 0.5175491571426392, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.6872602701187134, "sampling/importance_sampling_ratio/mean": 0.999973475933075, "sampling/importance_sampling_ratio/min": 0.5732381939888, "sampling/sampling_logp_difference/max": 0.5564539432525635, "sampling/sampling_logp_difference/mean": 0.009474346414208412, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7257.0, "completions/mean_length": 3258.75, "completions/mean_terminated_length": 3044.260986328125, "completions/min_length": 1712.0, "completions/min_terminated_length": 1712.0, "entropy": 0.27138666808605194, "epoch": 0.20025591810620602, "frac_reward_zero_std": 0.0, "grad_norm": 0.24367474122434793, "kl": 0.002079624042380601, "learning_rate": 9.04862633236489e-07, "loss": -0.0124, "num_tokens": 27172846.0, "reward": 0.8333333730697632, "reward_std": 0.7513124346733093, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001637935638428, "sampling/importance_sampling_ratio/min": 0.4777749478816986, "sampling/sampling_logp_difference/max": 0.9008181095123291, "sampling/sampling_logp_difference/mean": 0.008070755749940872, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6839.0, "completions/mean_length": 3655.0, "completions/mean_terminated_length": 3457.7392578125, "completions/min_length": 1343.0, "completions/min_terminated_length": 1343.0, "entropy": 0.3595985174179077, "epoch": 0.20089571337172105, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05322058666059468, "kl": 0.0018387524760328233, "learning_rate": 9.04272079613156e-07, "loss": 0.0367, "num_tokens": 27282686.0, "reward": 0.7083333730697632, "reward_std": 0.6504079103469849, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000368356704712, "sampling/importance_sampling_ratio/min": 0.5982042551040649, "sampling/sampling_logp_difference/max": 0.7547240257263184, "sampling/sampling_logp_difference/mean": 0.010204441845417023, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6600.0, "completions/max_terminated_length": 6600.0, "completions/mean_length": 3754.416748046875, "completions/mean_terminated_length": 3754.416748046875, "completions/min_length": 1624.0, "completions/min_terminated_length": 1624.0, "entropy": 0.3147711083292961, "epoch": 0.20153550863723607, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.056011573602802435, "kl": 0.0019553191086743027, "learning_rate": 9.036798927295757e-07, "loss": 0.0363, "num_tokens": 27395928.0, "reward": 1.2916667461395264, "reward_std": 0.5138766169548035, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999527931213379, "sampling/importance_sampling_ratio/min": 0.5544560551643372, "sampling/sampling_logp_difference/max": 0.8423960208892822, "sampling/sampling_logp_difference/mean": 0.00910517293959856, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 8149.0, "completions/mean_length": 4168.75, "completions/mean_terminated_length": 3803.0, "completions/min_length": 1821.0, "completions/min_terminated_length": 1821.0, "entropy": 0.37258492410182953, "epoch": 0.20217530390275112, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08921215999754425, "kl": 0.0017332011193502694, "learning_rate": 9.030860749781846e-07, "loss": 0.0124, "num_tokens": 27507626.0, "reward": 0.6666666865348816, "reward_std": 0.35634833574295044, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5511807203292847, "sampling/importance_sampling_ratio/mean": 0.9999231696128845, "sampling/importance_sampling_ratio/min": 0.4455025792121887, "sampling/sampling_logp_difference/max": 0.8085522651672363, "sampling/sampling_logp_difference/mean": 0.009866038337349892, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8085.0, "completions/mean_length": 3164.25, "completions/mean_terminated_length": 2945.65234375, "completions/min_length": 1026.0, "completions/min_terminated_length": 1026.0, "entropy": 0.4132281392812729, "epoch": 0.20281509916826615, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10537006230717222, "kl": 0.0018183898646384478, "learning_rate": 9.024906287580083e-07, "loss": 0.0319, "num_tokens": 27592568.0, "reward": 1.0, "reward_std": 0.34503278136253357, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.7858556509017944, "sampling/importance_sampling_ratio/mean": 0.9999261498451233, "sampling/importance_sampling_ratio/min": 0.38094645738601685, "sampling/sampling_logp_difference/max": 0.9650964736938477, "sampling/sampling_logp_difference/mean": 0.011071732267737389, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3554.0, "completions/max_terminated_length": 3554.0, "completions/mean_length": 2630.416748046875, "completions/mean_terminated_length": 2630.416748046875, "completions/min_length": 1029.0, "completions/min_terminated_length": 1029.0, "entropy": 0.1749483309686184, "epoch": 0.2034548944337812, "frac_reward_zero_std": 0.0, "grad_norm": 0.08428869253229908, "kl": 0.0015314601478166878, "learning_rate": 9.018935564746508e-07, "loss": 0.0079, "num_tokens": 27687330.0, "reward": 0.8333333730697632, "reward_std": 0.4857778251171112, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5677825212478638, "sampling/importance_sampling_ratio/mean": 0.9999467730522156, "sampling/importance_sampling_ratio/min": 0.43719711899757385, "sampling/sampling_logp_difference/max": 0.8273711204528809, "sampling/sampling_logp_difference/mean": 0.005446727387607098, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4453.0, "completions/max_terminated_length": 4453.0, "completions/mean_length": 2253.45849609375, "completions/mean_terminated_length": 2253.45849609375, "completions/min_length": 570.0, "completions/min_terminated_length": 570.0, "entropy": 0.24064119532704353, "epoch": 0.20409468969929623, "frac_reward_zero_std": 0.0, "grad_norm": 0.10908522614627043, "kl": 0.0019781520823016763, "learning_rate": 9.012948605402856e-07, "loss": 0.0465, "num_tokens": 27755573.0, "reward": 1.0833333730697632, "reward_std": 0.4857778251171112, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002096891403198, "sampling/importance_sampling_ratio/min": 0.6845225095748901, "sampling/sampling_logp_difference/max": 0.8243398666381836, "sampling/sampling_logp_difference/mean": 0.007097387220710516, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3489.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 1962.625, "completions/mean_terminated_length": 1962.625, "completions/min_length": 986.0, "completions/min_terminated_length": 986.0, "entropy": 0.24066360294818878, "epoch": 0.20473448496481125, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08567484355546655, "kl": 0.0016765882028266788, "learning_rate": 9.00694543373646e-07, "loss": -0.0138, "num_tokens": 27824540.0, "reward": 1.5, "reward_std": 0.36585909128189087, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.407688856124878, "sampling/importance_sampling_ratio/mean": 0.9999319911003113, "sampling/importance_sampling_ratio/min": 0.4017452001571655, "sampling/sampling_logp_difference/max": 0.9119372367858887, "sampling/sampling_logp_difference/mean": 0.006835358217358589, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7130.0, "completions/max_terminated_length": 7130.0, "completions/mean_length": 2661.625, "completions/mean_terminated_length": 2661.625, "completions/min_length": 775.0, "completions/min_terminated_length": 775.0, "entropy": 0.3611854836344719, "epoch": 0.2053742802303263, "frac_reward_zero_std": 0.0, "grad_norm": 0.11143263520572011, "kl": 0.002213964704424143, "learning_rate": 9.000926074000148e-07, "loss": -0.0463, "num_tokens": 27900171.0, "reward": 1.25, "reward_std": 0.5989742279052734, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000202655792236, "sampling/importance_sampling_ratio/min": 0.5505219101905823, "sampling/sampling_logp_difference/max": 1.0172717571258545, "sampling/sampling_logp_difference/mean": 0.009899337776005268, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7731.0, "completions/mean_length": 3060.45849609375, "completions/mean_terminated_length": 2593.95458984375, "completions/min_length": 1120.0, "completions/min_terminated_length": 1120.0, "entropy": 0.41807129234075546, "epoch": 0.20601407549584133, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1624667409498663, "kl": 0.00213477952638641, "learning_rate": 8.994890550512151e-07, "loss": 0.116, "num_tokens": 27989238.0, "reward": 0.75, "reward_std": 0.44819486141204834, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.462099313735962, "sampling/importance_sampling_ratio/mean": 1.0000669956207275, "sampling/importance_sampling_ratio/min": 0.5919684171676636, "sampling/sampling_logp_difference/max": 0.5243020057678223, "sampling/sampling_logp_difference/mean": 0.011878753080964088, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6529.0, "completions/max_terminated_length": 6529.0, "completions/mean_length": 3448.25, "completions/mean_terminated_length": 3448.25, "completions/min_length": 1704.0, "completions/min_terminated_length": 1704.0, "entropy": 0.47664082795381546, "epoch": 0.20665387076135636, "frac_reward_zero_std": 0.0, "grad_norm": 0.10353099436175123, "kl": 0.0024750629672780633, "learning_rate": 8.988838887655996e-07, "loss": 0.0364, "num_tokens": 28081636.0, "reward": 0.875, "reward_std": 0.5049939155578613, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998909831047058, "sampling/importance_sampling_ratio/min": 0.4494597017765045, "sampling/sampling_logp_difference/max": 0.7997090816497803, "sampling/sampling_logp_difference/mean": 0.012590980157256126, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7695.0, "completions/max_terminated_length": 7695.0, "completions/mean_length": 2976.08349609375, "completions/mean_terminated_length": 2976.08349609375, "completions/min_length": 1405.0, "completions/min_terminated_length": 1405.0, "entropy": 0.3400093540549278, "epoch": 0.2072936660268714, "frac_reward_zero_std": 0.0, "grad_norm": 0.09997266564004145, "kl": 0.0022724056034348905, "learning_rate": 8.982771109880419e-07, "loss": 0.0155, "num_tokens": 28166502.0, "reward": 1.1666667461395264, "reward_std": 0.747992217540741, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.3747538328170776, "sampling/importance_sampling_ratio/mean": 1.0002471208572388, "sampling/importance_sampling_ratio/min": 0.648089587688446, "sampling/sampling_logp_difference/max": 0.43372637033462524, "sampling/sampling_logp_difference/mean": 0.00987686961889267, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3972.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 2163.58349609375, "completions/mean_terminated_length": 2163.58349609375, "completions/min_length": 717.0, "completions/min_terminated_length": 717.0, "entropy": 0.28378522396087646, "epoch": 0.20793346129238643, "frac_reward_zero_std": 0.0, "grad_norm": 0.0959895104812924, "kl": 0.0021098304132465273, "learning_rate": 8.976687241699257e-07, "loss": -0.0008, "num_tokens": 28234420.0, "reward": 1.7083333730697632, "reward_std": 0.5317275524139404, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998263716697693, "sampling/importance_sampling_ratio/min": 0.43550512194633484, "sampling/sampling_logp_difference/max": 0.8763613700866699, "sampling/sampling_logp_difference/mean": 0.008804067969322205, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6447.0, "completions/max_terminated_length": 6447.0, "completions/mean_length": 2870.58349609375, "completions/mean_terminated_length": 2870.58349609375, "completions/min_length": 455.0, "completions/min_terminated_length": 455.0, "entropy": 0.31819649040699005, "epoch": 0.20857325655790146, "frac_reward_zero_std": 0.0, "grad_norm": 0.08934128588425934, "kl": 0.002086067048367113, "learning_rate": 8.970587307691355e-07, "loss": -0.0955, "num_tokens": 28321130.0, "reward": 1.125, "reward_std": 0.6681799292564392, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.2967795133590698, "sampling/importance_sampling_ratio/mean": 0.9998742938041687, "sampling/importance_sampling_ratio/min": 0.49250873923301697, "sampling/sampling_logp_difference/max": 0.7082430720329285, "sampling/sampling_logp_difference/mean": 0.008433545008301735, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7862.0, "completions/mean_length": 4814.9169921875, "completions/mean_terminated_length": 4507.9091796875, "completions/min_length": 558.0, "completions/min_terminated_length": 558.0, "entropy": 0.49440520256757736, "epoch": 0.2092130518234165, "frac_reward_zero_std": 0.0, "grad_norm": 0.07756356189831629, "kl": 0.0019292469369247556, "learning_rate": 8.964471332500458e-07, "loss": 0.1362, "num_tokens": 28448496.0, "reward": 0.5833333730697632, "reward_std": 0.6381160020828247, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999422430992126, "sampling/importance_sampling_ratio/min": 0.04187333956360817, "sampling/sampling_logp_difference/max": 3.1731059551239014, "sampling/sampling_logp_difference/mean": 0.012837477028369904, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6571.0, "completions/max_terminated_length": 6571.0, "completions/mean_length": 3046.25, "completions/mean_terminated_length": 3046.25, "completions/min_length": 1173.0, "completions/min_terminated_length": 1173.0, "entropy": 0.4126072898507118, "epoch": 0.20985284708893154, "frac_reward_zero_std": 0.0, "grad_norm": 0.08976174440978622, "kl": 0.0019990629225503653, "learning_rate": 8.958339340835127e-07, "loss": 0.1215, "num_tokens": 28535310.0, "reward": 1.0833333730697632, "reward_std": 0.6263689994812012, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.433010220527649, "sampling/importance_sampling_ratio/mean": 0.9999217391014099, "sampling/importance_sampling_ratio/min": 0.5696991682052612, "sampling/sampling_logp_difference/max": 0.5626468658447266, "sampling/sampling_logp_difference/mean": 0.01117711141705513, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6332.0, "completions/max_terminated_length": 6332.0, "completions/mean_length": 2652.166748046875, "completions/mean_terminated_length": 2652.166748046875, "completions/min_length": 1320.0, "completions/min_terminated_length": 1320.0, "entropy": 0.3526384085416794, "epoch": 0.2104926423544466, "frac_reward_zero_std": 0.0, "grad_norm": 0.09396077503057318, "kl": 0.002278032130561769, "learning_rate": 8.952191357468621e-07, "loss": 0.0649, "num_tokens": 28616218.0, "reward": 0.8333333730697632, "reward_std": 0.6536394953727722, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001468658447266, "sampling/importance_sampling_ratio/min": 0.47540396451950073, "sampling/sampling_logp_difference/max": 0.7745447158813477, "sampling/sampling_logp_difference/mean": 0.009760230779647827, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5525.0, "completions/max_terminated_length": 5525.0, "completions/mean_length": 2814.25, "completions/mean_terminated_length": 2814.25, "completions/min_length": 1586.0, "completions/min_terminated_length": 1586.0, "entropy": 0.2412930466234684, "epoch": 0.21113243761996162, "frac_reward_zero_std": 0.0, "grad_norm": 0.11412949233491915, "kl": 0.001501355436630547, "learning_rate": 8.946027407238808e-07, "loss": 0.0711, "num_tokens": 28705800.0, "reward": 1.1666667461395264, "reward_std": 0.6381160020828247, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.000079870223999, "sampling/importance_sampling_ratio/min": 0.4359823763370514, "sampling/sampling_logp_difference/max": 0.8301534652709961, "sampling/sampling_logp_difference/mean": 0.006992570124566555, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4392.0, "completions/max_terminated_length": 4392.0, "completions/mean_length": 2033.2083740234375, "completions/mean_terminated_length": 2033.2083740234375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4336271211504936, "epoch": 0.21177223288547664, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07015875038510005, "kl": 0.04596757798572071, "learning_rate": 8.939847515048064e-07, "loss": -0.0578, "num_tokens": 28771509.0, "reward": 0.4583333432674408, "reward_std": 0.6504079103469849, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5690453052520752, "sampling/importance_sampling_ratio/mean": 1.0000361204147339, "sampling/importance_sampling_ratio/min": 0.5631868243217468, "sampling/sampling_logp_difference/max": 0.5741438865661621, "sampling/sampling_logp_difference/mean": 0.009443956427276134, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4575.0, "completions/max_terminated_length": 4575.0, "completions/mean_length": 1831.75, "completions/mean_terminated_length": 1831.75, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.5008450895547867, "epoch": 0.2124120281509917, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0776519513593394, "kl": 0.0021920192521065474, "learning_rate": 8.93365170586317e-07, "loss": -0.1388, "num_tokens": 28830335.0, "reward": 1.125, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001875162124634, "sampling/importance_sampling_ratio/min": 0.6773057579994202, "sampling/sampling_logp_difference/max": 1.0171995162963867, "sampling/sampling_logp_difference/mean": 0.009322041645646095, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3340.0, "completions/max_terminated_length": 3340.0, "completions/mean_length": 1688.4583740234375, "completions/mean_terminated_length": 1688.4583740234375, "completions/min_length": 918.0, "completions/min_terminated_length": 918.0, "entropy": 0.2988165318965912, "epoch": 0.21305182341650672, "frac_reward_zero_std": 0.0, "grad_norm": 0.10843502279413253, "kl": 0.002204169664764777, "learning_rate": 8.92744000471521e-07, "loss": -0.0085, "num_tokens": 28888218.0, "reward": 1.5, "reward_std": 0.8046225905418396, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.42494797706604, "sampling/importance_sampling_ratio/mean": 1.0002784729003906, "sampling/importance_sampling_ratio/min": 0.7144548296928406, "sampling/sampling_logp_difference/max": 0.35413527488708496, "sampling/sampling_logp_difference/mean": 0.00841646920889616, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6951.0, "completions/max_terminated_length": 6951.0, "completions/mean_length": 2992.5, "completions/mean_terminated_length": 2992.5, "completions/min_length": 1281.0, "completions/min_terminated_length": 1281.0, "entropy": 0.31668607145547867, "epoch": 0.21369161868202174, "frac_reward_zero_std": 0.0, "grad_norm": 0.10840545654779003, "kl": 0.0017543078574817628, "learning_rate": 8.921212436699475e-07, "loss": -0.0454, "num_tokens": 28980246.0, "reward": 1.125, "reward_std": 0.7473440170288086, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.936798334121704, "sampling/importance_sampling_ratio/mean": 0.9999586939811707, "sampling/importance_sampling_ratio/min": 0.3296198844909668, "sampling/sampling_logp_difference/max": 1.1098151206970215, "sampling/sampling_logp_difference/mean": 0.008933575823903084, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7692.0, "completions/max_terminated_length": 7692.0, "completions/mean_length": 3100.0, "completions/mean_terminated_length": 3100.0, "completions/min_length": 916.0, "completions/min_terminated_length": 916.0, "entropy": 0.4524192735552788, "epoch": 0.2143314139475368, "frac_reward_zero_std": 0.0, "grad_norm": 0.12134693488799606, "kl": 0.0021944360923953354, "learning_rate": 8.914969026975353e-07, "loss": -0.217, "num_tokens": 29065134.0, "reward": 0.4583333432674408, "reward_std": 0.5625220537185669, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.609054446220398, "sampling/importance_sampling_ratio/mean": 0.9999575018882751, "sampling/importance_sampling_ratio/min": 0.4336167573928833, "sampling/sampling_logp_difference/max": 0.8355941772460938, "sampling/sampling_logp_difference/mean": 0.012146905064582825, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4005.0, "completions/max_terminated_length": 4005.0, "completions/mean_length": 2018.7083740234375, "completions/mean_terminated_length": 2018.7083740234375, "completions/min_length": 1216.0, "completions/min_terminated_length": 1216.0, "entropy": 0.3796650767326355, "epoch": 0.21497120921305182, "frac_reward_zero_std": 0.0, "grad_norm": 0.10799151109598853, "kl": 0.00264389521908015, "learning_rate": 8.908709800766236e-07, "loss": -0.0092, "num_tokens": 29130447.0, "reward": 1.75, "reward_std": 0.7071067690849304, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999072551727295, "sampling/importance_sampling_ratio/min": 0.5710948705673218, "sampling/sampling_logp_difference/max": 0.7788829803466797, "sampling/sampling_logp_difference/mean": 0.010086225345730782, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7795.0, "completions/max_terminated_length": 7795.0, "completions/mean_length": 3362.791748046875, "completions/mean_terminated_length": 3362.791748046875, "completions/min_length": 1417.0, "completions/min_terminated_length": 1417.0, "entropy": 0.4851776286959648, "epoch": 0.21561100447856685, "frac_reward_zero_std": 0.0, "grad_norm": 0.08998312196660568, "kl": 0.0018569593667052686, "learning_rate": 8.902434783359416e-07, "loss": 0.008, "num_tokens": 29224770.0, "reward": 1.2083333730697632, "reward_std": 0.7356865406036377, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.3853561878204346, "sampling/importance_sampling_ratio/mean": 1.000073790550232, "sampling/importance_sampling_ratio/min": 0.6192420721054077, "sampling/sampling_logp_difference/max": 0.47925901412963867, "sampling/sampling_logp_difference/mean": 0.012909447774291039, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4878.0, "completions/max_terminated_length": 4878.0, "completions/mean_length": 3267.33349609375, "completions/mean_terminated_length": 3267.33349609375, "completions/min_length": 994.0, "completions/min_terminated_length": 994.0, "entropy": 0.3070469871163368, "epoch": 0.2162507997440819, "frac_reward_zero_std": 0.0, "grad_norm": 0.09595535112735093, "kl": 0.0014741544728167355, "learning_rate": 8.896144000105977e-07, "loss": -0.0662, "num_tokens": 29329162.0, "reward": 0.875, "reward_std": 0.7810344696044922, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.9321004152297974, "sampling/importance_sampling_ratio/mean": 1.0000168085098267, "sampling/importance_sampling_ratio/min": 0.48303693532943726, "sampling/sampling_logp_difference/max": 0.727662205696106, "sampling/sampling_logp_difference/mean": 0.008968377485871315, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6316.0, "completions/max_terminated_length": 6316.0, "completions/mean_length": 3023.75, "completions/mean_terminated_length": 3023.75, "completions/min_length": 1658.0, "completions/min_terminated_length": 1658.0, "entropy": 0.2973993569612503, "epoch": 0.21689059500959693, "frac_reward_zero_std": 0.0, "grad_norm": 0.08972791864109061, "kl": 0.0019391021342016757, "learning_rate": 8.889837476420701e-07, "loss": 0.0251, "num_tokens": 29415788.0, "reward": 1.1666667461395264, "reward_std": 0.8924775123596191, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.5679999589920044, "sampling/importance_sampling_ratio/mean": 1.0000367164611816, "sampling/importance_sampling_ratio/min": 0.6088368892669678, "sampling/sampling_logp_difference/max": 0.49620485305786133, "sampling/sampling_logp_difference/mean": 0.00845093373209238, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7644.0, "completions/max_terminated_length": 7644.0, "completions/mean_length": 2180.83349609375, "completions/mean_terminated_length": 2180.83349609375, "completions/min_length": 855.0, "completions/min_terminated_length": 855.0, "entropy": 0.2968468964099884, "epoch": 0.21753039027511195, "frac_reward_zero_std": 0.0, "grad_norm": 0.10055089602402388, "kl": 0.0017665588820818812, "learning_rate": 8.883515237781962e-07, "loss": -0.0983, "num_tokens": 29483744.0, "reward": 1.1666667461395264, "reward_std": 0.4993361234664917, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 1.640687346458435, "sampling/importance_sampling_ratio/mean": 1.0000215768814087, "sampling/importance_sampling_ratio/min": 0.4268762469291687, "sampling/sampling_logp_difference/max": 0.8512611389160156, "sampling/sampling_logp_difference/mean": 0.008402559906244278, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6486.0, "completions/max_terminated_length": 6486.0, "completions/mean_length": 2900.83349609375, "completions/mean_terminated_length": 2900.83349609375, "completions/min_length": 1487.0, "completions/min_terminated_length": 1487.0, "entropy": 0.383199006319046, "epoch": 0.218170185540627, "frac_reward_zero_std": 0.0, "grad_norm": 0.09308659473080672, "kl": 0.0018851187487598509, "learning_rate": 8.877177309731616e-07, "loss": 0.0144, "num_tokens": 29573324.0, "reward": 1.0833333730697632, "reward_std": 0.6218419671058655, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.6700643301010132, "sampling/importance_sampling_ratio/mean": 0.9999716877937317, "sampling/importance_sampling_ratio/min": 0.5360302329063416, "sampling/sampling_logp_difference/max": 0.6235647201538086, "sampling/sampling_logp_difference/mean": 0.01095408946275711, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 4705.0, "completions/mean_length": 1996.375, "completions/mean_terminated_length": 1727.0, "completions/min_length": 632.0, "completions/min_terminated_length": 632.0, "entropy": 0.3067284971475601, "epoch": 0.21880998080614203, "frac_reward_zero_std": 0.0, "grad_norm": 0.18858608201412116, "kl": 0.0021819995890837163, "learning_rate": 8.870823717874911e-07, "loss": 0.3028, "num_tokens": 29632437.0, "reward": 1.8333333730697632, "reward_std": 0.4714045226573944, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.9583333134651184, "rewards/code_reward/std": 0.20412415266036987, "sampling/importance_sampling_ratio/max": 1.4358818531036377, "sampling/importance_sampling_ratio/mean": 0.999791145324707, "sampling/importance_sampling_ratio/min": 0.6236868500709534, "sampling/sampling_logp_difference/max": 0.47210693359375, "sampling/sampling_logp_difference/mean": 0.010215556249022484, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3226.0, "completions/max_terminated_length": 3226.0, "completions/mean_length": 1983.0833740234375, "completions/mean_terminated_length": 1983.0833740234375, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "entropy": 0.25754594802856445, "epoch": 0.21944977607165708, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06969255741548488, "kl": 0.0021538856672123075, "learning_rate": 8.864454487880376e-07, "loss": -0.0457, "num_tokens": 29701959.0, "reward": 0.7083333730697632, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.8284820318222046, "sampling/importance_sampling_ratio/mean": 1.0000454187393188, "sampling/importance_sampling_ratio/min": 0.506969153881073, "sampling/sampling_logp_difference/max": 0.6793050765991211, "sampling/sampling_logp_difference/mean": 0.006929989904165268, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5679.0, "completions/max_terminated_length": 5679.0, "completions/mean_length": 1923.041748046875, "completions/mean_terminated_length": 1923.041748046875, "completions/min_length": 867.0, "completions/min_terminated_length": 867.0, "entropy": 0.27675746381282806, "epoch": 0.2200895713371721, "frac_reward_zero_std": 0.0, "grad_norm": 0.11795814288603323, "kl": 0.001859032578067854, "learning_rate": 8.858069645479715e-07, "loss": -0.1349, "num_tokens": 29765632.0, "reward": 1.25, "reward_std": 0.6257078647613525, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5302616357803345, "sampling/importance_sampling_ratio/mean": 0.9999459385871887, "sampling/importance_sampling_ratio/min": 0.5837854743003845, "sampling/sampling_logp_difference/max": 0.5382217168807983, "sampling/sampling_logp_difference/mean": 0.008084889501333237, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4187.0, "completions/max_terminated_length": 4187.0, "completions/mean_length": 2208.20849609375, "completions/mean_terminated_length": 2208.20849609375, "completions/min_length": 1145.0, "completions/min_terminated_length": 1145.0, "entropy": 0.2603370323777199, "epoch": 0.22072936660268713, "frac_reward_zero_std": 0.0, "grad_norm": 0.09656317051599543, "kl": 0.0018067081400658935, "learning_rate": 8.851669216467707e-07, "loss": 0.002, "num_tokens": 29832093.0, "reward": 1.4166667461395264, "reward_std": 0.7318329811096191, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998429417610168, "sampling/importance_sampling_ratio/min": 0.47733819484710693, "sampling/sampling_logp_difference/max": 0.7448456287384033, "sampling/sampling_logp_difference/mean": 0.008101189509034157, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 8189.0, "completions/mean_length": 4023.166748046875, "completions/mean_terminated_length": 3841.9130859375, "completions/min_length": 1326.0, "completions/min_terminated_length": 1326.0, "entropy": 0.4342878758907318, "epoch": 0.22136916186820219, "frac_reward_zero_std": 0.0, "grad_norm": 0.09086233295988484, "kl": 0.002084367675706744, "learning_rate": 8.845253226702103e-07, "loss": 0.0545, "num_tokens": 29945585.0, "reward": 0.875, "reward_std": 0.7194125056266785, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.7810479402542114, "sampling/importance_sampling_ratio/mean": 0.9998917579650879, "sampling/importance_sampling_ratio/min": 0.3738689124584198, "sampling/sampling_logp_difference/max": 0.9838500618934631, "sampling/sampling_logp_difference/mean": 0.011448423378169537, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6497.0, "completions/max_terminated_length": 6497.0, "completions/mean_length": 2766.666748046875, "completions/mean_terminated_length": 2766.666748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3585687056183815, "epoch": 0.2220089571337172, "frac_reward_zero_std": 0.0, "grad_norm": 0.09272988176353658, "kl": 0.053776560293044895, "learning_rate": 8.838821702103519e-07, "loss": -0.0805, "num_tokens": 30029617.0, "reward": 1.125, "reward_std": 0.9094502329826355, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.828001618385315, "sampling/importance_sampling_ratio/mean": 1.0000911951065063, "sampling/importance_sampling_ratio/min": 0.6367035508155823, "sampling/sampling_logp_difference/max": 0.6032233238220215, "sampling/sampling_logp_difference/mean": 0.009808065369725227, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5600.0, "completions/max_terminated_length": 5600.0, "completions/mean_length": 3061.70849609375, "completions/mean_terminated_length": 3061.70849609375, "completions/min_length": 1543.0, "completions/min_terminated_length": 1543.0, "entropy": 0.3513096570968628, "epoch": 0.22264875239923224, "frac_reward_zero_std": 0.0, "grad_norm": 0.09007339758930123, "kl": 0.0033749114954844117, "learning_rate": 8.832374668655329e-07, "loss": -0.0501, "num_tokens": 30115282.0, "reward": 1.2916667461395264, "reward_std": 0.8093277812004089, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.8887829780578613, "sampling/importance_sampling_ratio/mean": 1.0000876188278198, "sampling/importance_sampling_ratio/min": 0.6177377104759216, "sampling/sampling_logp_difference/max": 0.6359326839447021, "sampling/sampling_logp_difference/mean": 0.010017436929047108, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1919.0, "completions/max_terminated_length": 1919.0, "completions/mean_length": 1339.916748046875, "completions/mean_terminated_length": 1339.916748046875, "completions/min_length": 812.0, "completions/min_terminated_length": 812.0, "entropy": 0.16018547303974628, "epoch": 0.2232885476647473, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.1007845219546839, "kl": 0.0018482583109289408, "learning_rate": 8.825912152403567e-07, "loss": 0.0336, "num_tokens": 30168912.0, "reward": 1.3333333730697632, "reward_std": 0.44819486141204834, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.402309775352478, "sampling/importance_sampling_ratio/mean": 0.9998766779899597, "sampling/importance_sampling_ratio/min": 0.6301547288894653, "sampling/sampling_logp_difference/max": 0.4617898464202881, "sampling/sampling_logp_difference/mean": 0.005230308976024389, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 6941.0, "completions/mean_length": 3896.541748046875, "completions/mean_terminated_length": 3037.449951171875, "completions/min_length": 551.0, "completions/min_terminated_length": 551.0, "entropy": 0.45641401410102844, "epoch": 0.22392834293026231, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08502291007550973, "kl": 0.0017898060905281454, "learning_rate": 8.819434179456813e-07, "loss": 0.1382, "num_tokens": 30271869.0, "reward": 1.375, "reward_std": 0.5930407047271729, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999576210975647, "sampling/importance_sampling_ratio/min": 0.5835857391357422, "sampling/sampling_logp_difference/max": 1.6087427139282227, "sampling/sampling_logp_difference/mean": 0.012020895257592201, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5104.0, "completions/max_terminated_length": 5104.0, "completions/mean_length": 2694.08349609375, "completions/mean_terminated_length": 2694.08349609375, "completions/min_length": 1237.0, "completions/min_terminated_length": 1237.0, "entropy": 0.3273267298936844, "epoch": 0.22456813819577734, "frac_reward_zero_std": 0.0, "grad_norm": 0.08105837243947064, "kl": 0.0021533854596782476, "learning_rate": 8.812940775986097e-07, "loss": -0.0236, "num_tokens": 30350359.0, "reward": 1.375, "reward_std": 0.6055297255516052, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999004006385803, "sampling/importance_sampling_ratio/min": 0.6113008260726929, "sampling/sampling_logp_difference/max": 2.8650238513946533, "sampling/sampling_logp_difference/mean": 0.009497851133346558, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3999.0, "completions/max_terminated_length": 3999.0, "completions/mean_length": 2050.166748046875, "completions/mean_terminated_length": 2050.166748046875, "completions/min_length": 1085.0, "completions/min_terminated_length": 1085.0, "entropy": 0.22086190432310104, "epoch": 0.2252079334612924, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08219614158945604, "kl": 0.0019403882615733892, "learning_rate": 8.806431968224784e-07, "loss": 0.0447, "num_tokens": 30424531.0, "reward": 1.0833333730697632, "reward_std": 0.33247750997543335, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998466372489929, "sampling/importance_sampling_ratio/min": 0.3843207359313965, "sampling/sampling_logp_difference/max": 0.9562778472900391, "sampling/sampling_logp_difference/mean": 0.006557693239301443, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5814.0, "completions/max_terminated_length": 5814.0, "completions/mean_length": 2574.166748046875, "completions/mean_terminated_length": 2574.166748046875, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.329226478934288, "epoch": 0.22584772872680742, "frac_reward_zero_std": 0.0, "grad_norm": 0.1028054925635398, "kl": 0.08029088244074956, "learning_rate": 8.799907782468473e-07, "loss": -0.0713, "num_tokens": 30503935.0, "reward": 1.4583333730697632, "reward_std": 0.8017482757568359, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5791767835617065, "sampling/importance_sampling_ratio/mean": 0.9998679161071777, "sampling/importance_sampling_ratio/min": 0.5320333242416382, "sampling/sampling_logp_difference/max": 0.6310491561889648, "sampling/sampling_logp_difference/mean": 0.008158158510923386, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5511.0, "completions/max_terminated_length": 5511.0, "completions/mean_length": 2029.291748046875, "completions/mean_terminated_length": 2029.291748046875, "completions/min_length": 957.0, "completions/min_terminated_length": 957.0, "entropy": 0.33138370513916016, "epoch": 0.22648752399232247, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0890679088889181, "kl": 0.002313807053724304, "learning_rate": 8.793368245074895e-07, "loss": -0.0668, "num_tokens": 30564198.0, "reward": 1.3333333730697632, "reward_std": 0.30860671401023865, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.3801708221435547, "sampling/importance_sampling_ratio/mean": 1.0001912117004395, "sampling/importance_sampling_ratio/min": 0.7143115997314453, "sampling/sampling_logp_difference/max": 0.33643603324890137, "sampling/sampling_logp_difference/mean": 0.00896228663623333, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1666666716337204, "completions/max_length": 8192.0, "completions/max_terminated_length": 5522.0, "completions/mean_length": 3241.666748046875, "completions/mean_terminated_length": 2251.60009765625, "completions/min_length": 49.0, "completions/min_terminated_length": 49.0, "entropy": 0.39252036809921265, "epoch": 0.2271273192578375, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05420888374359007, "kl": 0.002274273370858282, "learning_rate": 8.786813382463795e-07, "loss": 0.0897, "num_tokens": 30679622.0, "reward": 0.7916666865348816, "reward_std": 0.46798479557037354, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.7350419759750366, "sampling/importance_sampling_ratio/mean": 0.9998874068260193, "sampling/importance_sampling_ratio/min": 0.5264867544174194, "sampling/sampling_logp_difference/max": 0.6415290832519531, "sampling/sampling_logp_difference/mean": 0.009020006284117699, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7519.0, "completions/max_terminated_length": 7519.0, "completions/mean_length": 3302.166748046875, "completions/mean_terminated_length": 3302.166748046875, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "entropy": 0.33335933089256287, "epoch": 0.22776711452335252, "frac_reward_zero_std": 0.0, "grad_norm": 0.09349944625760845, "kl": 0.0018437823455315083, "learning_rate": 8.780243221116837e-07, "loss": -0.0196, "num_tokens": 30775746.0, "reward": 1.1666667461395264, "reward_std": 0.4446708858013153, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.452656626701355, "sampling/importance_sampling_ratio/mean": 0.9998298287391663, "sampling/importance_sampling_ratio/min": 0.33326035737991333, "sampling/sampling_logp_difference/max": 1.0988311767578125, "sampling/sampling_logp_difference/mean": 0.00871565192937851, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4468.0, "completions/max_terminated_length": 4468.0, "completions/mean_length": 2180.25, "completions/mean_terminated_length": 2180.25, "completions/min_length": 821.0, "completions/min_terminated_length": 821.0, "entropy": 0.27773313969373703, "epoch": 0.22840690978886757, "frac_reward_zero_std": 0.0, "grad_norm": 0.09470183105370181, "kl": 0.001920444512506947, "learning_rate": 8.773657787577488e-07, "loss": 0.0245, "num_tokens": 30844688.0, "reward": 1.0833333730697632, "reward_std": 0.7350383996963501, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.4312636852264404, "sampling/importance_sampling_ratio/mean": 1.0000805854797363, "sampling/importance_sampling_ratio/min": 0.45751431584358215, "sampling/sampling_logp_difference/max": 0.781947135925293, "sampling/sampling_logp_difference/mean": 0.007336476352065802, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5021.0, "completions/max_terminated_length": 5021.0, "completions/mean_length": 3178.45849609375, "completions/mean_terminated_length": 3178.45849609375, "completions/min_length": 1250.0, "completions/min_terminated_length": 1250.0, "entropy": 0.45367318391799927, "epoch": 0.2290467050543826, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.07378188635045975, "kl": 0.0023261570022441447, "learning_rate": 8.767057108450917e-07, "loss": -0.0035, "num_tokens": 30935427.0, "reward": 1.1666667461395264, "reward_std": 0.2519763112068176, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.633933186531067, "sampling/importance_sampling_ratio/mean": 0.9999335408210754, "sampling/importance_sampling_ratio/min": 0.47683656215667725, "sampling/sampling_logp_difference/max": 0.7405815124511719, "sampling/sampling_logp_difference/mean": 0.012094324454665184, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5248.0, "completions/max_terminated_length": 5248.0, "completions/mean_length": 2291.416748046875, "completions/mean_terminated_length": 2291.416748046875, "completions/min_length": 955.0, "completions/min_terminated_length": 955.0, "entropy": 0.3221847862005234, "epoch": 0.22968650031989762, "frac_reward_zero_std": 0.0, "grad_norm": 0.10635541024087576, "kl": 0.0022994405298959464, "learning_rate": 8.760441210403885e-07, "loss": -0.0388, "num_tokens": 31007045.0, "reward": 1.4583333730697632, "reward_std": 0.7767796516418457, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5452390909194946, "sampling/importance_sampling_ratio/mean": 0.9999706745147705, "sampling/importance_sampling_ratio/min": 0.5375425815582275, "sampling/sampling_logp_difference/max": 0.6207473278045654, "sampling/sampling_logp_difference/mean": 0.008965255692601204, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 8192.0, "completions/max_terminated_length": 7771.0, "completions/mean_length": 4349.95849609375, "completions/mean_terminated_length": 3801.09521484375, "completions/min_length": 1078.0, "completions/min_terminated_length": 1078.0, "entropy": 0.48607562482357025, "epoch": 0.23032629558541268, "frac_reward_zero_std": 0.0, "grad_norm": 0.08801354120556858, "kl": 0.0020431501034181565, "learning_rate": 8.753810120164637e-07, "loss": 0.1606, "num_tokens": 31126268.0, "reward": 0.9166666865348816, "reward_std": 0.4993361234664917, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.5772626399993896, "sampling/importance_sampling_ratio/mean": 0.9999969601631165, "sampling/importance_sampling_ratio/min": 0.5437886118888855, "sampling/sampling_logp_difference/max": 0.6091946363449097, "sampling/sampling_logp_difference/mean": 0.012709814123809338, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3036.0, "completions/max_terminated_length": 3036.0, "completions/mean_length": 1824.916748046875, "completions/mean_terminated_length": 1824.916748046875, "completions/min_length": 616.0, "completions/min_terminated_length": 616.0, "entropy": 0.3260195329785347, "epoch": 0.2309660908509277, "frac_reward_zero_std": 0.0, "grad_norm": 0.11673352514960357, "kl": 0.0017023338878061622, "learning_rate": 8.747163864522795e-07, "loss": -0.0033, "num_tokens": 31183282.0, "reward": 1.625, "reward_std": 0.6784341335296631, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.6523258686065674, "sampling/importance_sampling_ratio/mean": 1.0002561807632446, "sampling/importance_sampling_ratio/min": 0.5310205221176147, "sampling/sampling_logp_difference/max": 0.6329545974731445, "sampling/sampling_logp_difference/mean": 0.009237024933099747, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6029.0, "completions/max_terminated_length": 6029.0, "completions/mean_length": 3457.625, "completions/mean_terminated_length": 3457.625, "completions/min_length": 1441.0, "completions/min_terminated_length": 1441.0, "entropy": 0.29283686354756355, "epoch": 0.23160588611644273, "frac_reward_zero_std": 0.0, "grad_norm": 0.11847773196356709, "kl": 0.0015827445895411074, "learning_rate": 8.74050247032925e-07, "loss": 0.0598, "num_tokens": 31289921.0, "reward": 0.8333333730697632, "reward_std": 0.6752025485038757, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6293495893478394, "sampling/importance_sampling_ratio/mean": 0.9998762011528015, "sampling/importance_sampling_ratio/min": 0.5175476670265198, "sampling/sampling_logp_difference/max": 0.6586536169052124, "sampling/sampling_logp_difference/mean": 0.008785108104348183, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7535.0, "completions/max_terminated_length": 7535.0, "completions/mean_length": 3553.375, "completions/mean_terminated_length": 3553.375, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.3160405457019806, "epoch": 0.23224568138195778, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05719100149817917, "kl": 0.06395166588481516, "learning_rate": 8.733825964496051e-07, "loss": 0.0276, "num_tokens": 31393362.0, "reward": 0.2916666865348816, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.25, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999284148216248, "sampling/importance_sampling_ratio/min": 0.6192075610160828, "sampling/sampling_logp_difference/max": 1.0212514400482178, "sampling/sampling_logp_difference/mean": 0.009445506148040295, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6402.0, "completions/mean_length": 4289.7919921875, "completions/mean_terminated_length": 4120.13037109375, "completions/min_length": 1687.0, "completions/min_terminated_length": 1687.0, "entropy": 0.2872868925333023, "epoch": 0.2328854766474728, "frac_reward_zero_std": 0.0, "grad_norm": 0.07456640273634928, "kl": 0.0015790683683007956, "learning_rate": 8.727134373996297e-07, "loss": 0.0868, "num_tokens": 31527021.0, "reward": 1.375, "reward_std": 0.897425651550293, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.891119360923767, "sampling/importance_sampling_ratio/mean": 1.0000427961349487, "sampling/importance_sampling_ratio/min": 0.6108402013778687, "sampling/sampling_logp_difference/max": 0.6371688842773438, "sampling/sampling_logp_difference/mean": 0.007943147793412209, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7605.0, "completions/max_terminated_length": 7605.0, "completions/mean_length": 3823.25, "completions/mean_terminated_length": 3823.25, "completions/min_length": 1800.0, "completions/min_terminated_length": 1800.0, "entropy": 0.48485352098941803, "epoch": 0.23352527191298783, "frac_reward_zero_std": 0.0, "grad_norm": 0.10353545074365507, "kl": 0.0025777737027965486, "learning_rate": 8.720427725864034e-07, "loss": 0.0359, "num_tokens": 31633419.0, "reward": 1.4583333730697632, "reward_std": 0.7712149024009705, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.4220783710479736, "sampling/importance_sampling_ratio/mean": 0.9999728798866272, "sampling/importance_sampling_ratio/min": 0.21859772503376007, "sampling/sampling_logp_difference/max": 1.520522117614746, "sampling/sampling_logp_difference/mean": 0.012750916182994843, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4970.0, "completions/max_terminated_length": 4970.0, "completions/mean_length": 2693.25, "completions/mean_terminated_length": 2693.25, "completions/min_length": 1347.0, "completions/min_terminated_length": 1347.0, "entropy": 0.2306753285229206, "epoch": 0.23416506717850288, "frac_reward_zero_std": 0.0, "grad_norm": 0.08646035075143686, "kl": 0.0018678191991057247, "learning_rate": 8.713706047194135e-07, "loss": -0.0171, "num_tokens": 31734345.0, "reward": 1.5, "reward_std": 0.8893417716026306, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5938739776611328, "sampling/importance_sampling_ratio/mean": 1.0002297163009644, "sampling/importance_sampling_ratio/min": 0.37898316979408264, "sampling/sampling_logp_difference/max": 0.9702634811401367, "sampling/sampling_logp_difference/mean": 0.006943196058273315, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7248.0, "completions/max_terminated_length": 7248.0, "completions/mean_length": 2878.416748046875, "completions/mean_terminated_length": 2878.416748046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3774978518486023, "epoch": 0.2348048624440179, "frac_reward_zero_std": 0.0, "grad_norm": 0.0878304982346234, "kl": 0.024102517432766035, "learning_rate": 8.7069693651422e-07, "loss": -0.07, "num_tokens": 31818811.0, "reward": 0.9166666865348816, "reward_std": 0.5986984968185425, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0002573728561401, "sampling/importance_sampling_ratio/min": 0.3565075397491455, "sampling/sampling_logp_difference/max": 1.3008087873458862, "sampling/sampling_logp_difference/mean": 0.009384846314787865, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5837.0, "completions/max_terminated_length": 5837.0, "completions/mean_length": 3396.20849609375, "completions/mean_terminated_length": 3396.20849609375, "completions/min_length": 1511.0, "completions/min_terminated_length": 1511.0, "entropy": 0.4175002947449684, "epoch": 0.23544465770953296, "frac_reward_zero_std": 0.0, "grad_norm": 0.09232895156734734, "kl": 0.0025675345095805824, "learning_rate": 8.700217706924444e-07, "loss": -0.025, "num_tokens": 31912704.0, "reward": 1.0416667461395264, "reward_std": 0.6055297255516052, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5008234977722168, "sampling/importance_sampling_ratio/mean": 1.0000079870224, "sampling/importance_sampling_ratio/min": 0.5327754020690918, "sampling/sampling_logp_difference/max": 0.6296553611755371, "sampling/sampling_logp_difference/mean": 0.011469101533293724, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6158.0, "completions/mean_length": 3606.08349609375, "completions/mean_terminated_length": 3406.69580078125, "completions/min_length": 1483.0, "completions/min_terminated_length": 1483.0, "entropy": 0.3142111524939537, "epoch": 0.236084452975048, "frac_reward_zero_std": 0.0, "grad_norm": 0.11819675663126698, "kl": 0.0018086946802213788, "learning_rate": 8.693451099817581e-07, "loss": 0.2044, "num_tokens": 32023762.0, "reward": 1.1666667461395264, "reward_std": 0.7463539838790894, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.999854564666748, "sampling/importance_sampling_ratio/min": 0.35815340280532837, "sampling/sampling_logp_difference/max": 1.0267939567565918, "sampling/sampling_logp_difference/mean": 0.009041747078299522, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2876.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 1617.2083740234375, "completions/mean_terminated_length": 1617.2083740234375, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "entropy": 0.27304718270897865, "epoch": 0.236724248240563, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0782449794272042, "kl": 0.00309557281434536, "learning_rate": 8.686669571158722e-07, "loss": -0.0205, "num_tokens": 32087079.0, "reward": 1.0, "reward_std": 0.5605830550193787, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5146771669387817, "sampling/importance_sampling_ratio/mean": 1.0001322031021118, "sampling/importance_sampling_ratio/min": 0.5196537971496582, "sampling/sampling_logp_difference/max": 0.6545925140380859, "sampling/sampling_logp_difference/mean": 0.007568106055259705, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4919.0, "completions/max_terminated_length": 4919.0, "completions/mean_length": 2348.666748046875, "completions/mean_terminated_length": 2348.666748046875, "completions/min_length": 1127.0, "completions/min_terminated_length": 1127.0, "entropy": 0.41199320554733276, "epoch": 0.23736404350607807, "frac_reward_zero_std": 0.0, "grad_norm": 0.1175565946790491, "kl": 0.0024311536108143628, "learning_rate": 8.679873148345261e-07, "loss": 0.0159, "num_tokens": 32154567.0, "reward": 0.9166666865348816, "reward_std": 0.6480017900466919, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.695764422416687, "sampling/importance_sampling_ratio/mean": 0.999928891658783, "sampling/importance_sampling_ratio/min": 0.642716109752655, "sampling/sampling_logp_difference/max": 0.5281336307525635, "sampling/sampling_logp_difference/mean": 0.010883405804634094, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4131.0, "completions/max_terminated_length": 4131.0, "completions/mean_length": 1906.0833740234375, "completions/mean_terminated_length": 1906.0833740234375, "completions/min_length": 1121.0, "completions/min_terminated_length": 1121.0, "entropy": 0.20002714917063713, "epoch": 0.2380038387715931, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07755073711427192, "kl": 0.0018460355058778077, "learning_rate": 8.673061858834765e-07, "loss": -0.0109, "num_tokens": 32219297.0, "reward": 1.5416667461395264, "reward_std": 0.3535533845424652, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.5456762313842773, "sampling/importance_sampling_ratio/mean": 0.9998446106910706, "sampling/importance_sampling_ratio/min": 0.5085968971252441, "sampling/sampling_logp_difference/max": 0.6760995388031006, "sampling/sampling_logp_difference/mean": 0.006289754994213581, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5200.0, "completions/max_terminated_length": 5200.0, "completions/mean_length": 2397.041748046875, "completions/mean_terminated_length": 2397.041748046875, "completions/min_length": 633.0, "completions/min_terminated_length": 633.0, "entropy": 0.27855608239769936, "epoch": 0.23864363403710812, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08575375334160229, "kl": 0.0022526354005094618, "learning_rate": 8.666235730144857e-07, "loss": -0.0727, "num_tokens": 32294754.0, "reward": 1.375, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5415550470352173, "sampling/importance_sampling_ratio/mean": 1.0000640153884888, "sampling/importance_sampling_ratio/min": 0.27445393800735474, "sampling/sampling_logp_difference/max": 1.2929718494415283, "sampling/sampling_logp_difference/mean": 0.008477342315018177, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3524.0, "completions/max_terminated_length": 3524.0, "completions/mean_length": 2261.791748046875, "completions/mean_terminated_length": 2261.791748046875, "completions/min_length": 1455.0, "completions/min_terminated_length": 1455.0, "entropy": 0.23241941630840302, "epoch": 0.23928342930262317, "frac_reward_zero_std": 0.0, "grad_norm": 0.09663933858595787, "kl": 0.002187374688219279, "learning_rate": 8.659394789853118e-07, "loss": -0.012, "num_tokens": 32368765.0, "reward": 1.3333333730697632, "reward_std": 0.8869583606719971, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.3939704895019531, "sampling/importance_sampling_ratio/mean": 1.0001195669174194, "sampling/importance_sampling_ratio/min": 0.4972170889377594, "sampling/sampling_logp_difference/max": 0.6987285614013672, "sampling/sampling_logp_difference/mean": 0.006919467356055975, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6683.0, "completions/max_terminated_length": 6683.0, "completions/mean_length": 3566.166748046875, "completions/mean_terminated_length": 3566.166748046875, "completions/min_length": 1564.0, "completions/min_terminated_length": 1564.0, "entropy": 0.5099367648363113, "epoch": 0.2399232245681382, "frac_reward_zero_std": 0.0, "grad_norm": 0.1125888104038024, "kl": 0.0030111020314507186, "learning_rate": 8.652539065596965e-07, "loss": -0.0077, "num_tokens": 32464953.0, "reward": 1.0, "reward_std": 0.5260697603225708, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.5984938144683838, "sampling/importance_sampling_ratio/mean": 1.000229001045227, "sampling/importance_sampling_ratio/min": 0.40776726603507996, "sampling/sampling_logp_difference/max": 0.8970587253570557, "sampling/sampling_logp_difference/mean": 0.013563418760895729, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6344.0, "completions/max_terminated_length": 6344.0, "completions/mean_length": 3233.541748046875, "completions/mean_terminated_length": 3233.541748046875, "completions/min_length": 889.0, "completions/min_terminated_length": 889.0, "entropy": 0.4730556085705757, "epoch": 0.24056301983365322, "frac_reward_zero_std": 0.0, "grad_norm": 0.10138569482021519, "kl": 0.0022931554121896625, "learning_rate": 8.645668585073538e-07, "loss": -0.0638, "num_tokens": 32555398.0, "reward": 0.9166666865348816, "reward_std": 0.6536394953727722, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.459604024887085, "sampling/importance_sampling_ratio/mean": 1.0001845359802246, "sampling/importance_sampling_ratio/min": 0.7136679887771606, "sampling/sampling_logp_difference/max": 0.37816524505615234, "sampling/sampling_logp_difference/mean": 0.01241067610681057, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2083333432674408, "completions/max_length": 8192.0, "completions/max_terminated_length": 8134.0, "completions/mean_length": 5257.5419921875, "completions/mean_terminated_length": 4485.31591796875, "completions/min_length": 1590.0, "completions/min_terminated_length": 1590.0, "entropy": 0.5691364035010338, "epoch": 0.24120281509916827, "frac_reward_zero_std": 0.0, "grad_norm": 0.07411762841221918, "kl": 0.0023480578674934804, "learning_rate": 8.6387833760396e-07, "loss": 0.0813, "num_tokens": 32696763.0, "reward": 0.625, "reward_std": 0.5794824361801147, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000232458114624, "sampling/importance_sampling_ratio/min": 0.40318670868873596, "sampling/sampling_logp_difference/max": 0.9441713094711304, "sampling/sampling_logp_difference/mean": 0.014554668217897415, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7573.0, "completions/mean_length": 3871.75, "completions/mean_terminated_length": 3683.9130859375, "completions/min_length": 1829.0, "completions/min_terminated_length": 1829.0, "entropy": 0.30523499846458435, "epoch": 0.2418426103646833, "frac_reward_zero_std": 0.0, "grad_norm": 0.08219484907408585, "kl": 0.0022518352488987148, "learning_rate": 8.631883466311411e-07, "loss": 0.102, "num_tokens": 32808189.0, "reward": 1.2083333730697632, "reward_std": 0.8224833011627197, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.5910650491714478, "sampling/importance_sampling_ratio/mean": 1.0002102851867676, "sampling/importance_sampling_ratio/min": 0.4191672205924988, "sampling/sampling_logp_difference/max": 0.8694853782653809, "sampling/sampling_logp_difference/mean": 0.008833091706037521, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7802.0, "completions/mean_length": 3383.916748046875, "completions/mean_terminated_length": 3174.86962890625, "completions/min_length": 1299.0, "completions/min_terminated_length": 1299.0, "entropy": 0.3390166461467743, "epoch": 0.24248240563019835, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07973839908453133, "kl": 0.0021931225492153317, "learning_rate": 8.624968883764625e-07, "loss": 0.037, "num_tokens": 32911195.0, "reward": 0.875, "reward_std": 0.3698274493217468, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000406503677368, "sampling/importance_sampling_ratio/min": 0.5735681056976318, "sampling/sampling_logp_difference/max": 0.7321996688842773, "sampling/sampling_logp_difference/mean": 0.00944510381668806, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4439.0, "completions/max_terminated_length": 4439.0, "completions/mean_length": 2327.25, "completions/mean_terminated_length": 2327.25, "completions/min_length": 1251.0, "completions/min_terminated_length": 1251.0, "entropy": 0.26663850992918015, "epoch": 0.24312220089571338, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.11309444482293732, "kl": 0.0018115928978659213, "learning_rate": 8.618039656334173e-07, "loss": -0.0642, "num_tokens": 32988313.0, "reward": 1.1666667461395264, "reward_std": 0.4497717618942261, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5097007751464844, "sampling/importance_sampling_ratio/mean": 0.9999885559082031, "sampling/importance_sampling_ratio/min": 0.6043734550476074, "sampling/sampling_logp_difference/max": 0.5035629272460938, "sampling/sampling_logp_difference/mean": 0.0076091960072517395, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4448.0, "completions/max_terminated_length": 4448.0, "completions/mean_length": 2652.541748046875, "completions/mean_terminated_length": 2652.541748046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5267564579844475, "epoch": 0.2437619961612284, "frac_reward_zero_std": 0.0, "grad_norm": 0.10953038302262777, "kl": 0.002580779488198459, "learning_rate": 8.611095812014153e-07, "loss": -0.0389, "num_tokens": 33064454.0, "reward": 0.75, "reward_std": 0.9006573557853699, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.7926123142242432, "sampling/importance_sampling_ratio/mean": 0.9999333024024963, "sampling/importance_sampling_ratio/min": 0.5672857761383057, "sampling/sampling_logp_difference/max": 0.5836739540100098, "sampling/sampling_logp_difference/mean": 0.011077853851020336, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5585.0, "completions/max_terminated_length": 5585.0, "completions/mean_length": 2343.20849609375, "completions/mean_terminated_length": 2343.20849609375, "completions/min_length": 792.0, "completions/min_terminated_length": 792.0, "entropy": 0.31509777158498764, "epoch": 0.24440179142674345, "frac_reward_zero_std": 0.0, "grad_norm": 0.13963644500318803, "kl": 0.002804766409099102, "learning_rate": 8.604137378857712e-07, "loss": -0.0204, "num_tokens": 33132363.0, "reward": 1.1666667461395264, "reward_std": 0.7866922616958618, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.8464089632034302, "sampling/importance_sampling_ratio/mean": 0.9999086260795593, "sampling/importance_sampling_ratio/min": 0.08583681285381317, "sampling/sampling_logp_difference/max": 2.4553072452545166, "sampling/sampling_logp_difference/mean": 0.00893223937600851, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4464.0, "completions/max_terminated_length": 4464.0, "completions/mean_length": 2431.416748046875, "completions/mean_terminated_length": 2431.416748046875, "completions/min_length": 966.0, "completions/min_terminated_length": 966.0, "entropy": 0.23674091696739197, "epoch": 0.24504158669225848, "frac_reward_zero_std": 0.0, "grad_norm": 0.07780278162492246, "kl": 0.0018334749329369515, "learning_rate": 8.597164384976938e-07, "loss": -0.1063, "num_tokens": 33212517.0, "reward": 1.3333333730697632, "reward_std": 0.789085328578949, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.4031113386154175, "sampling/importance_sampling_ratio/mean": 1.0001168251037598, "sampling/importance_sampling_ratio/min": 0.5575536489486694, "sampling/sampling_logp_difference/max": 0.5841965675354004, "sampling/sampling_logp_difference/mean": 0.007335100322961807, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7255.0, "completions/max_terminated_length": 7255.0, "completions/mean_length": 3224.291748046875, "completions/mean_terminated_length": 3224.291748046875, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.4291822835803032, "epoch": 0.2456813819577735, "frac_reward_zero_std": 0.0, "grad_norm": 0.07470958782037243, "kl": 0.2118362431647256, "learning_rate": 8.590176858542747e-07, "loss": -0.1192, "num_tokens": 33304668.0, "reward": 1.625, "reward_std": 0.7194125056266785, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.687756896018982, "sampling/importance_sampling_ratio/mean": 0.9999380111694336, "sampling/importance_sampling_ratio/min": 0.4848935306072235, "sampling/sampling_logp_difference/max": 0.7238259315490723, "sampling/sampling_logp_difference/mean": 0.010636106133460999, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6612.0, "completions/mean_length": 2852.70849609375, "completions/mean_terminated_length": 2620.565185546875, "completions/min_length": 1245.0, "completions/min_terminated_length": 1245.0, "entropy": 0.35268060863018036, "epoch": 0.24632117722328856, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10005949922660205, "kl": 0.0018316333589609712, "learning_rate": 8.583174827784761e-07, "loss": 0.0454, "num_tokens": 33385429.0, "reward": 1.3333333730697632, "reward_std": 0.4714045226573944, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.3749830722808838, "sampling/importance_sampling_ratio/mean": 1.0003598928451538, "sampling/importance_sampling_ratio/min": 0.4420264959335327, "sampling/sampling_logp_difference/max": 0.8163855075836182, "sampling/sampling_logp_difference/mean": 0.009714926593005657, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7940.0, "completions/mean_length": 3824.791748046875, "completions/mean_terminated_length": 3634.9130859375, "completions/min_length": 1067.0, "completions/min_terminated_length": 1067.0, "entropy": 0.46554146707057953, "epoch": 0.24696097248880358, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09101352730460224, "kl": 0.003125332703348249, "learning_rate": 8.576158320991204e-07, "loss": 0.1375, "num_tokens": 33492160.0, "reward": 0.5833333730697632, "reward_std": 0.4629100561141968, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9998782277107239, "sampling/importance_sampling_ratio/min": 0.4377730190753937, "sampling/sampling_logp_difference/max": 1.2980608940124512, "sampling/sampling_logp_difference/mean": 0.012752949260175228, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3942.0, "completions/max_terminated_length": 3942.0, "completions/mean_length": 2401.75, "completions/mean_terminated_length": 2401.75, "completions/min_length": 1149.0, "completions/min_terminated_length": 1149.0, "entropy": 0.28760963678359985, "epoch": 0.2476007677543186, "frac_reward_zero_std": 0.0, "grad_norm": 0.11045341902028456, "kl": 0.002063578722300008, "learning_rate": 8.569127366508782e-07, "loss": 0.0297, "num_tokens": 33566434.0, "reward": 1.2916667461395264, "reward_std": 0.603628933429718, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.6540333032608032, "sampling/importance_sampling_ratio/mean": 0.9997016787528992, "sampling/importance_sampling_ratio/min": 0.41799303889274597, "sampling/sampling_logp_difference/max": 0.8722904920578003, "sampling/sampling_logp_difference/mean": 0.008293719962239265, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7355.0, "completions/mean_length": 4134.6669921875, "completions/mean_terminated_length": 3765.818359375, "completions/min_length": 1434.0, "completions/min_terminated_length": 1434.0, "entropy": 0.4026099145412445, "epoch": 0.24824056301983366, "frac_reward_zero_std": 0.0, "grad_norm": 0.10090300765093564, "kl": 0.002196443296270445, "learning_rate": 8.562081992742567e-07, "loss": 0.1132, "num_tokens": 33680266.0, "reward": 0.625, "reward_std": 0.7442201972007751, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.7748198509216309, "sampling/importance_sampling_ratio/mean": 1.000043272972107, "sampling/importance_sampling_ratio/min": 0.44710806012153625, "sampling/sampling_logp_difference/max": 0.804955005645752, "sampling/sampling_logp_difference/mean": 0.011684934608638287, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4167.0, "completions/max_terminated_length": 4167.0, "completions/mean_length": 2622.0, "completions/mean_terminated_length": 2622.0, "completions/min_length": 1306.0, "completions/min_terminated_length": 1306.0, "entropy": 0.10494825802743435, "epoch": 0.24888035828534869, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.05474029881980868, "kl": 0.0011206766503164545, "learning_rate": 8.555022228155889e-07, "loss": -0.0333, "num_tokens": 33783130.0, "reward": 1.1666667461395264, "reward_std": 0.39000558853149414, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.9611144065856934, "sampling/importance_sampling_ratio/mean": 0.9999637603759766, "sampling/importance_sampling_ratio/min": 0.5515662431716919, "sampling/sampling_logp_difference/max": 0.6735129356384277, "sampling/sampling_logp_difference/mean": 0.0033004581928253174, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4845.0, "completions/max_terminated_length": 4845.0, "completions/mean_length": 2758.70849609375, "completions/mean_terminated_length": 2758.70849609375, "completions/min_length": 1497.0, "completions/min_terminated_length": 1497.0, "entropy": 0.2387045994400978, "epoch": 0.2495201535508637, "frac_reward_zero_std": 0.0, "grad_norm": 0.08525409150154979, "kl": 0.001869861618615687, "learning_rate": 8.547948101270215e-07, "loss": -0.068, "num_tokens": 33875907.0, "reward": 0.9583333730697632, "reward_std": 0.6490218043327332, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.7597070932388306, "sampling/importance_sampling_ratio/mean": 1.0000596046447754, "sampling/importance_sampling_ratio/min": 0.5928007364273071, "sampling/sampling_logp_difference/max": 0.5651473999023438, "sampling/sampling_logp_difference/mean": 0.00695901270955801, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7114.0, "completions/mean_length": 3534.041748046875, "completions/mean_terminated_length": 3331.521728515625, "completions/min_length": 1224.0, "completions/min_terminated_length": 1224.0, "entropy": 0.3727884367108345, "epoch": 0.25015994881637876, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0736382780692213, "kl": 0.0024940618313848972, "learning_rate": 8.540859640665035e-07, "loss": 0.0725, "num_tokens": 33982868.0, "reward": 0.4583333432674408, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.4828062057495117, "sampling/importance_sampling_ratio/mean": 0.999805748462677, "sampling/importance_sampling_ratio/min": 0.5411491990089417, "sampling/sampling_logp_difference/max": 0.6140602827072144, "sampling/sampling_logp_difference/mean": 0.01044220756739378, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4763.0, "completions/max_terminated_length": 4763.0, "completions/mean_length": 2643.25, "completions/mean_terminated_length": 2643.25, "completions/min_length": 1279.0, "completions/min_terminated_length": 1279.0, "entropy": 0.4614550620317459, "epoch": 0.2507997440818938, "frac_reward_zero_std": 0.0, "grad_norm": 0.10193713031058808, "kl": 0.0024154451675713062, "learning_rate": 8.53375687497775e-07, "loss": -0.0056, "num_tokens": 34056282.0, "reward": 1.0833333730697632, "reward_std": 0.5201624631881714, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.5966168642044067, "sampling/importance_sampling_ratio/mean": 0.9999458193778992, "sampling/importance_sampling_ratio/min": 0.7112767100334167, "sampling/sampling_logp_difference/max": 0.46788692474365234, "sampling/sampling_logp_difference/mean": 0.012314788065850735, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5455.0, "completions/max_terminated_length": 5455.0, "completions/mean_length": 2766.791748046875, "completions/mean_terminated_length": 2766.791748046875, "completions/min_length": 1740.0, "completions/min_terminated_length": 1740.0, "entropy": 0.3356524929404259, "epoch": 0.2514395393474088, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07242813934309418, "kl": 0.0021321903041098267, "learning_rate": 8.526639832903551e-07, "loss": 0.0123, "num_tokens": 34144317.0, "reward": 0.875, "reward_std": 0.2721545100212097, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6739168167114258, "sampling/importance_sampling_ratio/mean": 1.0001055002212524, "sampling/importance_sampling_ratio/min": 0.44474759697914124, "sampling/sampling_logp_difference/max": 0.8102483749389648, "sampling/sampling_logp_difference/mean": 0.009237266145646572, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6054.0, "completions/max_terminated_length": 6054.0, "completions/mean_length": 2949.916748046875, "completions/mean_terminated_length": 2949.916748046875, "completions/min_length": 1292.0, "completions/min_terminated_length": 1292.0, "entropy": 0.39653078466653824, "epoch": 0.25207933461292387, "frac_reward_zero_std": 0.0, "grad_norm": 0.10779372171134609, "kl": 0.002689950284548104, "learning_rate": 8.519508543195305e-07, "loss": -0.0111, "num_tokens": 34229187.0, "reward": 0.9166666865348816, "reward_std": 0.8140539526939392, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.5595934391021729, "sampling/importance_sampling_ratio/mean": 1.0001546144485474, "sampling/importance_sampling_ratio/min": 0.5031372904777527, "sampling/sampling_logp_difference/max": 0.6868922710418701, "sampling/sampling_logp_difference/mean": 0.010940290987491608, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2983.0, "completions/max_terminated_length": 2983.0, "completions/mean_length": 1554.75, "completions/mean_terminated_length": 1554.75, "completions/min_length": 1.0, "completions/min_terminated_length": 1.0, "entropy": 0.41169822588562965, "epoch": 0.2527191298784389, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0863913178177266, "kl": 0.01336806095787324, "learning_rate": 8.512363034663441e-07, "loss": -0.0916, "num_tokens": 34280869.0, "reward": 1.125, "reward_std": 0.17251639068126678, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148510992527008, "sampling/importance_sampling_ratio/max": 1.5959022045135498, "sampling/importance_sampling_ratio/mean": 1.000078797340393, "sampling/importance_sampling_ratio/min": 0.31538331508636475, "sampling/sampling_logp_difference/max": 1.1539664268493652, "sampling/sampling_logp_difference/mean": 0.008290551602840424, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5500.0, "completions/max_terminated_length": 5500.0, "completions/mean_length": 3106.83349609375, "completions/mean_terminated_length": 3106.83349609375, "completions/min_length": 1036.0, "completions/min_terminated_length": 1036.0, "entropy": 0.44777218252420425, "epoch": 0.2533589251439539, "frac_reward_zero_std": 0.0, "grad_norm": 0.12045463768916806, "kl": 0.002501695998944342, "learning_rate": 8.505203336175835e-07, "loss": -0.0574, "num_tokens": 34366297.0, "reward": 0.875, "reward_std": 0.6341476440429688, "rewards/accuracy_reward/mean": 0.2083333283662796, "rewards/accuracy_reward/std": 0.4148511290550232, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434515476227, "sampling/importance_sampling_ratio/max": 1.5271052122116089, "sampling/importance_sampling_ratio/mean": 1.0001519918441772, "sampling/importance_sampling_ratio/min": 0.6283902525901794, "sampling/sampling_logp_difference/max": 0.46459388732910156, "sampling/sampling_logp_difference/mean": 0.012416343204677105, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7159.0, "completions/mean_length": 4078.08349609375, "completions/mean_terminated_length": 3899.217529296875, "completions/min_length": 1386.0, "completions/min_terminated_length": 1386.0, "entropy": 0.41709716618061066, "epoch": 0.25399872040946897, "frac_reward_zero_std": 0.0, "grad_norm": 0.10009320786768426, "kl": 0.002198471745941788, "learning_rate": 8.498029476657684e-07, "loss": 0.0141, "num_tokens": 34478075.0, "reward": 0.7916666865348816, "reward_std": 0.6106518507003784, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.9791412353515625, "sampling/importance_sampling_ratio/mean": 0.9998809695243835, "sampling/importance_sampling_ratio/min": 0.451332151889801, "sampling/sampling_logp_difference/max": 0.7955517768859863, "sampling/sampling_logp_difference/mean": 0.011253377422690392, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7808.0, "completions/max_terminated_length": 7808.0, "completions/mean_length": 3096.875, "completions/mean_terminated_length": 3096.875, "completions/min_length": 1719.0, "completions/min_terminated_length": 1719.0, "entropy": 0.2685724310576916, "epoch": 0.254638515674984, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07896003423448365, "kl": 0.0018634321168065071, "learning_rate": 8.4908414850914e-07, "loss": 0.0279, "num_tokens": 34578736.0, "reward": 0.7916666865348816, "reward_std": 0.4082186222076416, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 0.9999451637268066, "sampling/importance_sampling_ratio/min": 0.3924522399902344, "sampling/sampling_logp_difference/max": 0.935340404510498, "sampling/sampling_logp_difference/mean": 0.008205209858715534, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 6629.0, "completions/mean_length": 2933.291748046875, "completions/mean_terminated_length": 2704.65234375, "completions/min_length": 1099.0, "completions/min_terminated_length": 1099.0, "entropy": 0.32758597284555435, "epoch": 0.255278310940499, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.0800473593344592, "kl": 0.0020337953174021095, "learning_rate": 8.483639390516487e-07, "loss": 0.0256, "num_tokens": 34664287.0, "reward": 1.0833333730697632, "reward_std": 0.15430335700511932, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.7651267051696777, "sampling/importance_sampling_ratio/mean": 1.0001344680786133, "sampling/importance_sampling_ratio/min": 0.288560688495636, "sampling/sampling_logp_difference/max": 1.2428498268127441, "sampling/sampling_logp_difference/mean": 0.009104838594794273, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2137.0, "completions/max_terminated_length": 2137.0, "completions/mean_length": 1566.916748046875, "completions/mean_terminated_length": 1566.916748046875, "completions/min_length": 938.0, "completions/min_terminated_length": 938.0, "entropy": 0.1910550557076931, "epoch": 0.2559181062060141, "frac_reward_zero_std": 0.0, "grad_norm": 0.10177765180132403, "kl": 0.002281327935634181, "learning_rate": 8.476423222029429e-07, "loss": 0.0229, "num_tokens": 34724125.0, "reward": 1.1666667461395264, "reward_std": 0.7864982485771179, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.4286984205245972, "sampling/importance_sampling_ratio/mean": 0.9999433159828186, "sampling/importance_sampling_ratio/min": 0.4771292805671692, "sampling/sampling_logp_difference/max": 0.7399678230285645, "sampling/sampling_logp_difference/mean": 0.006126228719949722, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 8039.0, "completions/max_terminated_length": 8039.0, "completions/mean_length": 2887.625, "completions/mean_terminated_length": 2887.625, "completions/min_length": 1674.0, "completions/min_terminated_length": 1674.0, "entropy": 0.28682825341820717, "epoch": 0.2565579014715291, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09211584715738094, "kl": 0.0022189306910149753, "learning_rate": 8.46919300878356e-07, "loss": 0.1555, "num_tokens": 34822700.0, "reward": 1.5, "reward_std": 0.6649550199508667, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.9390701055526733, "sampling/importance_sampling_ratio/mean": 1.0000568628311157, "sampling/importance_sampling_ratio/min": 0.6058487892150879, "sampling/sampling_logp_difference/max": 0.6622085571289062, "sampling/sampling_logp_difference/mean": 0.008489737287163734, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5485.0, "completions/max_terminated_length": 5485.0, "completions/mean_length": 3202.291748046875, "completions/mean_terminated_length": 3202.291748046875, "completions/min_length": 1359.0, "completions/min_terminated_length": 1359.0, "entropy": 0.370882973074913, "epoch": 0.2571976967370441, "frac_reward_zero_std": 0.0, "grad_norm": 0.10927025102319722, "kl": 0.0021533010876737535, "learning_rate": 8.461948779988965e-07, "loss": 0.0484, "num_tokens": 34920755.0, "reward": 1.125, "reward_std": 0.5794824361801147, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.9123631715774536, "sampling/importance_sampling_ratio/mean": 1.0000648498535156, "sampling/importance_sampling_ratio/min": 0.5672289133071899, "sampling/sampling_logp_difference/max": 0.6483397483825684, "sampling/sampling_logp_difference/mean": 0.010930700227618217, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7109.0, "completions/mean_length": 3326.75, "completions/mean_terminated_length": 2884.45458984375, "completions/min_length": 657.0, "completions/min_terminated_length": 657.0, "entropy": 0.2974410466849804, "epoch": 0.2578374920025592, "frac_reward_zero_std": 0.0, "grad_norm": 0.08976932762456018, "kl": 0.0019455200235825032, "learning_rate": 8.454690564912346e-07, "loss": -0.0457, "num_tokens": 35025461.0, "reward": 1.2916667461395264, "reward_std": 0.5625220537185669, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0001816749572754, "sampling/importance_sampling_ratio/min": 0.5099574327468872, "sampling/sampling_logp_difference/max": 1.1125810146331787, "sampling/sampling_logp_difference/mean": 0.00873557012528181, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7095.0, "completions/mean_length": 3788.58349609375, "completions/mean_terminated_length": 3388.27294921875, "completions/min_length": 1139.0, "completions/min_terminated_length": 1139.0, "entropy": 0.3935716599225998, "epoch": 0.25847728726807423, "frac_reward_zero_std": 0.0, "grad_norm": 0.10665432025116096, "kl": 0.0019186319841537625, "learning_rate": 8.447418392876907e-07, "loss": 0.0488, "num_tokens": 35132747.0, "reward": 0.7916666865348816, "reward_std": 0.5914937257766724, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.9498887062072754, "sampling/importance_sampling_ratio/mean": 1.0000079870224, "sampling/importance_sampling_ratio/min": 0.31473252177238464, "sampling/sampling_logp_difference/max": 1.1560320854187012, "sampling/sampling_logp_difference/mean": 0.011024229228496552, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6208.0, "completions/max_terminated_length": 6208.0, "completions/mean_length": 2288.83349609375, "completions/mean_terminated_length": 2288.83349609375, "completions/min_length": 995.0, "completions/min_terminated_length": 995.0, "entropy": 0.30914872139692307, "epoch": 0.2591170825335892, "frac_reward_zero_std": 0.0, "grad_norm": 0.09722626825212252, "kl": 0.0018748754810076207, "learning_rate": 8.440132293262246e-07, "loss": -0.0326, "num_tokens": 35205815.0, "reward": 0.8333333730697632, "reward_std": 0.5260698199272156, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.520294189453125, "sampling/importance_sampling_ratio/mean": 0.9999938607215881, "sampling/importance_sampling_ratio/min": 0.4816569983959198, "sampling/sampling_logp_difference/max": 0.7305231094360352, "sampling/sampling_logp_difference/mean": 0.008535368368029594, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7424.0, "completions/mean_length": 4317.0, "completions/mean_terminated_length": 3964.727294921875, "completions/min_length": 1876.0, "completions/min_terminated_length": 1876.0, "entropy": 0.47424693405628204, "epoch": 0.2597568777991043, "frac_reward_zero_std": 0.0, "grad_norm": 0.09882858297772412, "kl": 0.002283598994836211, "learning_rate": 8.432832295504223e-07, "loss": -0.0342, "num_tokens": 35324599.0, "reward": 0.5, "reward_std": 0.5807350873947144, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.667648434638977, "sampling/importance_sampling_ratio/mean": 0.9999033808708191, "sampling/importance_sampling_ratio/min": 0.14560355246067047, "sampling/sampling_logp_difference/max": 1.9268677234649658, "sampling/sampling_logp_difference/mean": 0.01245282031595707, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4340.0, "completions/max_terminated_length": 4340.0, "completions/mean_length": 2323.83349609375, "completions/mean_terminated_length": 2323.83349609375, "completions/min_length": 1563.0, "completions/min_terminated_length": 1563.0, "entropy": 0.20582865178585052, "epoch": 0.26039667306461933, "frac_reward_zero_std": 0.0, "grad_norm": 0.10525321808826019, "kl": 0.001551418099552393, "learning_rate": 8.425518429094847e-07, "loss": 0.1962, "num_tokens": 35401211.0, "reward": 1.625, "reward_std": 0.7923169136047363, "rewards/accuracy_reward/mean": 0.8333333134651184, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.5579640865325928, "sampling/importance_sampling_ratio/mean": 1.000210165977478, "sampling/importance_sampling_ratio/min": 0.6680150628089905, "sampling/sampling_logp_difference/max": 0.44337987899780273, "sampling/sampling_logp_difference/mean": 0.006096178665757179, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 5244.0, "completions/mean_length": 2945.75, "completions/mean_terminated_length": 2717.65234375, "completions/min_length": 1241.0, "completions/min_terminated_length": 1241.0, "entropy": 0.3556283861398697, "epoch": 0.26103646833013433, "frac_reward_zero_std": 0.0, "grad_norm": 0.09195998044549017, "kl": 0.0020971007470507175, "learning_rate": 8.418190723582159e-07, "loss": 0.2021, "num_tokens": 35489957.0, "reward": 0.875, "reward_std": 0.7682852149009705, "rewards/accuracy_reward/mean": 0.2916666567325592, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000380277633667, "sampling/importance_sampling_ratio/min": 0.4722258150577545, "sampling/sampling_logp_difference/max": 0.750298023223877, "sampling/sampling_logp_difference/mean": 0.010430891066789627, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6345.0, "completions/max_terminated_length": 6345.0, "completions/mean_length": 3410.58349609375, "completions/mean_terminated_length": 3410.58349609375, "completions/min_length": 1519.0, "completions/min_terminated_length": 1519.0, "entropy": 0.25483885407447815, "epoch": 0.2616762635956494, "frac_reward_zero_std": 0.0, "grad_norm": 0.08138115109856031, "kl": 0.0018614544242154807, "learning_rate": 8.410849208570108e-07, "loss": -0.0347, "num_tokens": 35594547.0, "reward": 0.625, "reward_std": 0.5660459995269775, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.5, "rewards/code_reward/std": 0.5107539296150208, "sampling/importance_sampling_ratio/max": 1.8894238471984863, "sampling/importance_sampling_ratio/mean": 0.9999833106994629, "sampling/importance_sampling_ratio/min": 0.43400731682777405, "sampling/sampling_logp_difference/max": 0.8346939086914062, "sampling/sampling_logp_difference/mean": 0.007058208342641592, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4101.0, "completions/max_terminated_length": 4101.0, "completions/mean_length": 2467.58349609375, "completions/mean_terminated_length": 2467.58349609375, "completions/min_length": 1423.0, "completions/min_terminated_length": 1423.0, "entropy": 0.1780620962381363, "epoch": 0.26231605886116444, "frac_reward_zero_std": 0.0, "grad_norm": 0.09202180297675237, "kl": 0.0017959823599085212, "learning_rate": 8.403493913718431e-07, "loss": 0.0137, "num_tokens": 35673657.0, "reward": 1.4583333730697632, "reward_std": 0.8224833011627197, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.8925451040267944, "sampling/importance_sampling_ratio/mean": 0.999950110912323, "sampling/importance_sampling_ratio/min": 0.4337066411972046, "sampling/sampling_logp_difference/max": 0.8353869915008545, "sampling/sampling_logp_difference/mean": 0.0053551578894257545, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6842.0, "completions/max_terminated_length": 6842.0, "completions/mean_length": 3058.5, "completions/mean_terminated_length": 3058.5, "completions/min_length": 1084.0, "completions/min_terminated_length": 1084.0, "entropy": 0.2916036397218704, "epoch": 0.2629558541266795, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09646924177649437, "kl": 0.0017269774398300797, "learning_rate": 8.39612486874254e-07, "loss": 0.1065, "num_tokens": 35777197.0, "reward": 1.2083333730697632, "reward_std": 0.42645785212516785, "rewards/accuracy_reward/mean": 0.5833333134651184, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.570220708847046, "sampling/importance_sampling_ratio/mean": 0.9998629093170166, "sampling/importance_sampling_ratio/min": 0.5936465859413147, "sampling/sampling_logp_difference/max": 0.5214711427688599, "sampling/sampling_logp_difference/mean": 0.008816095069050789, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3790.0, "completions/max_terminated_length": 3790.0, "completions/mean_length": 2204.875, "completions/mean_terminated_length": 2204.875, "completions/min_length": 1250.0, "completions/min_terminated_length": 1250.0, "entropy": 0.36912185698747635, "epoch": 0.2635956493921945, "frac_reward_zero_std": 0.0, "grad_norm": 0.11960960813418911, "kl": 0.0025751430657692254, "learning_rate": 8.388742103413395e-07, "loss": -0.0256, "num_tokens": 35845466.0, "reward": 1.1666667461395264, "reward_std": 0.7093448638916016, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.463850975036621, "sampling/importance_sampling_ratio/mean": 0.999971330165863, "sampling/importance_sampling_ratio/min": 0.7001603841781616, "sampling/sampling_logp_difference/max": 0.381070613861084, "sampling/sampling_logp_difference/mean": 0.009911742992699146, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5299.0, "completions/max_terminated_length": 5299.0, "completions/mean_length": 2744.291748046875, "completions/mean_terminated_length": 2744.291748046875, "completions/min_length": 1556.0, "completions/min_terminated_length": 1556.0, "entropy": 0.3100289925932884, "epoch": 0.26423544465770954, "frac_reward_zero_std": 0.0, "grad_norm": 0.11962089407921316, "kl": 0.0019164059485774487, "learning_rate": 8.38134564755739e-07, "loss": -0.0049, "num_tokens": 35937665.0, "reward": 1.4583333730697632, "reward_std": 0.8180223703384399, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.92007577419281, "sampling/importance_sampling_ratio/mean": 1.0001071691513062, "sampling/importance_sampling_ratio/min": 0.4439471960067749, "sampling/sampling_logp_difference/max": 0.8120496273040771, "sampling/sampling_logp_difference/mean": 0.00885101966559887, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5359.0, "completions/max_terminated_length": 5359.0, "completions/mean_length": 2656.70849609375, "completions/mean_terminated_length": 2656.70849609375, "completions/min_length": 1276.0, "completions/min_terminated_length": 1276.0, "entropy": 0.34153613448143005, "epoch": 0.2648752399232246, "frac_reward_zero_std": 0.0, "grad_norm": 0.10328713630860505, "kl": 0.0019157144997734576, "learning_rate": 8.373935531056222e-07, "loss": 0.0437, "num_tokens": 36021058.0, "reward": 1.2916667461395264, "reward_std": 0.8079166412353516, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.9286404848098755, "sampling/importance_sampling_ratio/mean": 1.0002719163894653, "sampling/importance_sampling_ratio/min": 0.4664519429206848, "sampling/sampling_logp_difference/max": 0.762600302696228, "sampling/sampling_logp_difference/mean": 0.009251400828361511, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7058.0, "completions/mean_length": 3036.166748046875, "completions/mean_terminated_length": 2567.45458984375, "completions/min_length": 665.0, "completions/min_terminated_length": 665.0, "entropy": 0.2981121502816677, "epoch": 0.2655150351887396, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.06863490453910778, "kl": 0.0031545525998808444, "learning_rate": 8.366511783846783e-07, "loss": 0.1914, "num_tokens": 36111686.0, "reward": 0.2916666865348816, "reward_std": 0.3268197476863861, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/code_reward/mean": 0.2916666567325592, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.520033836364746, "sampling/importance_sampling_ratio/mean": 1.0000022649765015, "sampling/importance_sampling_ratio/min": 0.6320555806159973, "sampling/sampling_logp_difference/max": 0.45877790451049805, "sampling/sampling_logp_difference/mean": 0.0083543099462986, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7511.0, "completions/max_terminated_length": 7511.0, "completions/mean_length": 2772.70849609375, "completions/mean_terminated_length": 2772.70849609375, "completions/min_length": 1327.0, "completions/min_terminated_length": 1327.0, "entropy": 0.3476159796118736, "epoch": 0.26615483045425464, "frac_reward_zero_std": 0.0, "grad_norm": 0.11055083002168681, "kl": 0.0021005047019571066, "learning_rate": 8.359074435921031e-07, "loss": 0.108, "num_tokens": 36193543.0, "reward": 1.125, "reward_std": 0.695723295211792, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.5915985107421875, "sampling/importance_sampling_ratio/mean": 1.0000510215759277, "sampling/importance_sampling_ratio/min": 0.6172606945037842, "sampling/sampling_logp_difference/max": 0.4824638366699219, "sampling/sampling_logp_difference/mean": 0.009225226938724518, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7329.0, "completions/max_terminated_length": 7329.0, "completions/mean_length": 3168.83349609375, "completions/mean_terminated_length": 3168.83349609375, "completions/min_length": 1100.0, "completions/min_terminated_length": 1100.0, "entropy": 0.32629943639039993, "epoch": 0.2667946257197697, "frac_reward_zero_std": 0.0, "grad_norm": 0.09703056854114053, "kl": 0.002082445746054873, "learning_rate": 8.35162351732587e-07, "loss": 0.1502, "num_tokens": 36281707.0, "reward": 1.0, "reward_std": 0.5605829954147339, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.6666666865348816, "rewards/code_reward/std": 0.4815434217453003, "sampling/importance_sampling_ratio/max": 1.619848608970642, "sampling/importance_sampling_ratio/mean": 0.9998509287834167, "sampling/importance_sampling_ratio/min": 0.47908052802085876, "sampling/sampling_logp_difference/max": 0.7358865737915039, "sampling/sampling_logp_difference/mean": 0.009420981630682945, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6785.0, "completions/max_terminated_length": 6785.0, "completions/mean_length": 3078.666748046875, "completions/mean_terminated_length": 3078.666748046875, "completions/min_length": 824.0, "completions/min_terminated_length": 824.0, "entropy": 0.3733624666929245, "epoch": 0.2674344209852847, "frac_reward_zero_std": 0.0, "grad_norm": 0.10542020411095511, "kl": 0.0024527779896743596, "learning_rate": 8.34415905816303e-07, "loss": 0.0496, "num_tokens": 36375435.0, "reward": 1.2916667461395264, "reward_std": 0.6784341335296631, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.4108213186264038, "sampling/importance_sampling_ratio/mean": 1.0000213384628296, "sampling/importance_sampling_ratio/min": 0.6356550455093384, "sampling/sampling_logp_difference/max": 0.45309925079345703, "sampling/sampling_logp_difference/mean": 0.009906777180731297, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5187.0, "completions/max_terminated_length": 5187.0, "completions/mean_length": 2601.95849609375, "completions/mean_terminated_length": 2601.95849609375, "completions/min_length": 779.0, "completions/min_terminated_length": 779.0, "entropy": 0.37434203177690506, "epoch": 0.26807421625079975, "frac_reward_zero_std": 0.0, "grad_norm": 0.102546109108712, "kl": 0.002511319355107844, "learning_rate": 8.336681088588947e-07, "loss": -0.0597, "num_tokens": 36447746.0, "reward": 1.0416667461395264, "reward_std": 0.7714905738830566, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.6519502401351929, "sampling/importance_sampling_ratio/mean": 0.9998770356178284, "sampling/importance_sampling_ratio/min": 0.6770451664924622, "sampling/sampling_logp_difference/max": 0.501956582069397, "sampling/sampling_logp_difference/mean": 0.010746121406555176, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5587.0, "completions/max_terminated_length": 5587.0, "completions/mean_length": 2954.5, "completions/mean_terminated_length": 2954.5, "completions/min_length": 863.0, "completions/min_terminated_length": 863.0, "entropy": 0.39315520226955414, "epoch": 0.2687140115163148, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08315541944986601, "kl": 0.00234442378859967, "learning_rate": 8.329189638814635e-07, "loss": 0.019, "num_tokens": 36536694.0, "reward": 1.5, "reward_std": 0.5261822938919067, "rewards/accuracy_reward/mean": 0.6666666865348816, "rewards/accuracy_reward/std": 0.4815434217453003, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.721627116203308, "sampling/importance_sampling_ratio/mean": 0.999760091304779, "sampling/importance_sampling_ratio/min": 0.5060985684394836, "sampling/sampling_logp_difference/max": 0.6810238361358643, "sampling/sampling_logp_difference/mean": 0.010959936305880547, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0416666679084301, "completions/max_length": 8192.0, "completions/max_terminated_length": 7773.0, "completions/mean_length": 3555.166748046875, "completions/mean_terminated_length": 3353.565185546875, "completions/min_length": 1538.0, "completions/min_terminated_length": 1538.0, "entropy": 0.3307937830686569, "epoch": 0.2693538067818298, "frac_reward_zero_std": 0.0, "grad_norm": 0.08660260155856765, "kl": 0.002415671944618225, "learning_rate": 8.321684739105572e-07, "loss": -0.0243, "num_tokens": 36640066.0, "reward": 0.5416666865348816, "reward_std": 0.695723295211792, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.337831974029541, "rewards/code_reward/mean": 0.4166666567325592, "rewards/code_reward/std": 0.5036101937294006, "sampling/importance_sampling_ratio/max": 1.876548409461975, "sampling/importance_sampling_ratio/mean": 0.9999459385871887, "sampling/importance_sampling_ratio/min": 0.6155917048454285, "sampling/sampling_logp_difference/max": 0.6294341087341309, "sampling/sampling_logp_difference/mean": 0.008573142811655998, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2559.0, "completions/max_terminated_length": 2559.0, "completions/mean_length": 1720.2083740234375, "completions/mean_terminated_length": 1720.2083740234375, "completions/min_length": 13.0, "completions/min_terminated_length": 13.0, "entropy": 0.2097198348492384, "epoch": 0.26999360204734485, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.072644529796673, "kl": 0.011781079752836376, "learning_rate": 8.314166419781575e-07, "loss": -0.0933, "num_tokens": 36707959.0, "reward": 1.25, "reward_std": 0.5920505523681641, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.7916666865348816, "rewards/code_reward/std": 0.4148511290550232, "sampling/importance_sampling_ratio/max": 1.4652400016784668, "sampling/importance_sampling_ratio/mean": 0.9999802708625793, "sampling/importance_sampling_ratio/min": 0.5423303246498108, "sampling/sampling_logp_difference/max": 0.6118800640106201, "sampling/sampling_logp_difference/mean": 0.0039399596862494946, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7981.0, "completions/max_terminated_length": 7981.0, "completions/mean_length": 2152.166748046875, "completions/mean_terminated_length": 2152.166748046875, "completions/min_length": 1134.0, "completions/min_terminated_length": 1134.0, "entropy": 0.26274942979216576, "epoch": 0.2706333973128599, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09453076494498439, "kl": 0.0021337087091524154, "learning_rate": 8.306634711216666e-07, "loss": -0.1809, "num_tokens": 36770619.0, "reward": 1.625, "reward_std": 0.4493255615234375, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.9166666865348816, "rewards/code_reward/std": 0.28232985734939575, "sampling/importance_sampling_ratio/max": 1.8940261602401733, "sampling/importance_sampling_ratio/mean": 1.0000529289245605, "sampling/importance_sampling_ratio/min": 0.7035578489303589, "sampling/sampling_logp_difference/max": 0.638704776763916, "sampling/sampling_logp_difference/mean": 0.007624979596585035, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4847.0, "completions/max_terminated_length": 4847.0, "completions/mean_length": 2764.83349609375, "completions/mean_terminated_length": 2764.83349609375, "completions/min_length": 1331.0, "completions/min_terminated_length": 1331.0, "entropy": 0.3805844113230705, "epoch": 0.2712731925783749, "frac_reward_zero_std": 0.0, "grad_norm": 0.10120208623396883, "kl": 0.0025000913883559406, "learning_rate": 8.299089643838975e-07, "loss": 0.0318, "num_tokens": 36851543.0, "reward": 0.9166666865348816, "reward_std": 0.6015613079071045, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 2.0, "sampling/importance_sampling_ratio/mean": 1.0000791549682617, "sampling/importance_sampling_ratio/min": 0.5073050856590271, "sampling/sampling_logp_difference/max": 1.8601646423339844, "sampling/sampling_logp_difference/mean": 0.010719021782279015, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7481.0, "completions/max_terminated_length": 7481.0, "completions/mean_length": 3301.33349609375, "completions/mean_terminated_length": 3301.33349609375, "completions/min_length": 1360.0, "completions/min_terminated_length": 1360.0, "entropy": 0.3298930451273918, "epoch": 0.27191298784388995, "frac_reward_zero_std": 0.0, "grad_norm": 0.08406023627666914, "kl": 0.002403023128863424, "learning_rate": 8.291531248130588e-07, "loss": 0.0777, "num_tokens": 36942855.0, "reward": 1.125, "reward_std": 0.6171872615814209, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.823217749595642, "sampling/importance_sampling_ratio/mean": 1.0000450611114502, "sampling/importance_sampling_ratio/min": 0.5005224943161011, "sampling/sampling_logp_difference/max": 0.6921026706695557, "sampling/sampling_logp_difference/mean": 0.009452614933252335, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6079.0, "completions/max_terminated_length": 6079.0, "completions/mean_length": 2915.875, "completions/mean_terminated_length": 2915.875, "completions/min_length": 1725.0, "completions/min_terminated_length": 1725.0, "entropy": 0.24292847886681557, "epoch": 0.272552783109405, "frac_reward_zero_std": 0.0, "grad_norm": 0.08346945126343792, "kl": 0.001862907491158694, "learning_rate": 8.283959554627446e-07, "loss": 0.029, "num_tokens": 37043316.0, "reward": 1.1666667461395264, "reward_std": 0.9557830691337585, "rewards/accuracy_reward/mean": 0.5416666865348816, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.625, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.5649055242538452, "sampling/importance_sampling_ratio/mean": 1.0000805854797363, "sampling/importance_sampling_ratio/min": 0.56679767370224, "sampling/sampling_logp_difference/max": 0.5677528381347656, "sampling/sampling_logp_difference/mean": 0.007125508040189743, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 7154.0, "completions/mean_length": 3360.875, "completions/mean_terminated_length": 2921.681884765625, "completions/min_length": 1033.0, "completions/min_terminated_length": 1033.0, "entropy": 0.36808905750513077, "epoch": 0.27319257837492, "frac_reward_zero_std": 0.0, "grad_norm": 0.09281787261897878, "kl": 0.001967532589333132, "learning_rate": 8.27637459391921e-07, "loss": -0.0574, "num_tokens": 37140505.0, "reward": 0.7916666865348816, "reward_std": 0.5078567266464233, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.7745301723480225, "sampling/importance_sampling_ratio/mean": 1.0000265836715698, "sampling/importance_sampling_ratio/min": 0.6334691643714905, "sampling/sampling_logp_difference/max": 0.573535680770874, "sampling/sampling_logp_difference/mean": 0.009703259915113449, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5544.0, "completions/max_terminated_length": 5544.0, "completions/mean_length": 2981.75, "completions/mean_terminated_length": 2981.75, "completions/min_length": 1364.0, "completions/min_terminated_length": 1364.0, "entropy": 0.4301706328988075, "epoch": 0.27383237364043506, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.08174864055142075, "kl": 0.002624788088724017, "learning_rate": 8.268776396649143e-07, "loss": -0.0471, "num_tokens": 37230723.0, "reward": 0.9166666865348816, "reward_std": 0.510651707649231, "rewards/accuracy_reward/mean": 0.1666666716337204, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.731995701789856, "sampling/importance_sampling_ratio/mean": 0.9998424053192139, "sampling/importance_sampling_ratio/min": 0.40413203835487366, "sampling/sampling_logp_difference/max": 0.9060136079788208, "sampling/sampling_logp_difference/mean": 0.011503275483846664, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3607.0, "completions/max_terminated_length": 3607.0, "completions/mean_length": 1961.291748046875, "completions/mean_terminated_length": 1961.291748046875, "completions/min_length": 963.0, "completions/min_terminated_length": 963.0, "entropy": 0.20187317952513695, "epoch": 0.2744721689059501, "frac_reward_zero_std": 0.0, "grad_norm": 0.10115848838027847, "kl": 0.001894430664833635, "learning_rate": 8.261164993513977e-07, "loss": 0.0978, "num_tokens": 37299882.0, "reward": 1.5, "reward_std": 0.6571635007858276, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.769805908203125, "sampling/importance_sampling_ratio/mean": 1.0000133514404297, "sampling/importance_sampling_ratio/min": 0.5936793684959412, "sampling/sampling_logp_difference/max": 0.5708699226379395, "sampling/sampling_logp_difference/mean": 0.006390755996108055, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5386.0, "completions/max_terminated_length": 5386.0, "completions/mean_length": 2903.08349609375, "completions/mean_terminated_length": 2903.08349609375, "completions/min_length": 1630.0, "completions/min_terminated_length": 1630.0, "entropy": 0.2931368947029114, "epoch": 0.2751119641714651, "frac_reward_zero_std": 0.0, "grad_norm": 0.10374955622192475, "kl": 0.0019125625549349934, "learning_rate": 8.253540415263805e-07, "loss": -0.0153, "num_tokens": 37391764.0, "reward": 1.375, "reward_std": 0.6712342500686646, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.875, "rewards/code_reward/std": 0.337831974029541, "sampling/importance_sampling_ratio/max": 1.8089662790298462, "sampling/importance_sampling_ratio/mean": 0.9999495148658752, "sampling/importance_sampling_ratio/min": 0.6127456426620483, "sampling/sampling_logp_difference/max": 0.5927555561065674, "sampling/sampling_logp_difference/mean": 0.00836273841559887, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6257.0, "completions/max_terminated_length": 6257.0, "completions/mean_length": 3944.70849609375, "completions/mean_terminated_length": 3944.70849609375, "completions/min_length": 1774.0, "completions/min_terminated_length": 1774.0, "entropy": 0.4687158912420273, "epoch": 0.27575175943698016, "frac_reward_zero_std": 0.0, "grad_norm": 0.09188956403006494, "kl": 0.0026504829293116927, "learning_rate": 8.245902692701938e-07, "loss": 0.1499, "num_tokens": 37497733.0, "reward": 0.7083333730697632, "reward_std": 0.8263596296310425, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4423258602619171, "rewards/code_reward/mean": 0.4583333432674408, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.9128600358963013, "sampling/importance_sampling_ratio/mean": 0.9999690651893616, "sampling/importance_sampling_ratio/min": 0.2673325538635254, "sampling/sampling_logp_difference/max": 1.3192617893218994, "sampling/sampling_logp_difference/mean": 0.012269515544176102, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0833333358168602, "completions/max_length": 8192.0, "completions/max_terminated_length": 8062.0, "completions/mean_length": 3249.916748046875, "completions/mean_terminated_length": 2800.636474609375, "completions/min_length": 1071.0, "completions/min_terminated_length": 1071.0, "entropy": 0.2426382675766945, "epoch": 0.2763915547024952, "frac_reward_zero_std": 0.0, "grad_norm": 0.09476625333891327, "kl": 0.0019275251834187657, "learning_rate": 8.238251856684799e-07, "loss": -0.1114, "num_tokens": 37594339.0, "reward": 1.125, "reward_std": 0.5625219941139221, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101341247559, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.7565516233444214, "sampling/importance_sampling_ratio/mean": 1.000087857246399, "sampling/importance_sampling_ratio/min": 0.36955180764198303, "sampling/sampling_logp_difference/max": 0.9954643249511719, "sampling/sampling_logp_difference/mean": 0.00734469760209322, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6545.0, "completions/max_terminated_length": 6545.0, "completions/mean_length": 3014.375, "completions/mean_terminated_length": 3014.375, "completions/min_length": 1491.0, "completions/min_terminated_length": 1491.0, "entropy": 0.3519033119082451, "epoch": 0.2770313499680102, "frac_reward_zero_std": 0.0, "grad_norm": 0.11912269618639315, "kl": 0.002749658131506294, "learning_rate": 8.230587938121782e-07, "loss": 0.0886, "num_tokens": 37687396.0, "reward": 1.25, "reward_std": 0.7135938405990601, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5107539296150208, "rewards/code_reward/mean": 0.75, "rewards/code_reward/std": 0.4423258602619171, "sampling/importance_sampling_ratio/max": 1.4350476264953613, "sampling/importance_sampling_ratio/mean": 1.0000104904174805, "sampling/importance_sampling_ratio/min": 0.5019360780715942, "sampling/sampling_logp_difference/max": 0.6892825365066528, "sampling/sampling_logp_difference/mean": 0.009997429326176643, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7080.0, "completions/max_terminated_length": 7080.0, "completions/mean_length": 3091.58349609375, "completions/mean_terminated_length": 3091.58349609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3540499582886696, "epoch": 0.27767114523352526, "frac_reward_zero_std": 0.0, "grad_norm": 0.09047856605950946, "kl": 0.01592088956385851, "learning_rate": 8.222910967975143e-07, "loss": 0.0277, "num_tokens": 37786658.0, "reward": 1.0416667461395264, "reward_std": 0.7828061580657959, "rewards/accuracy_reward/mean": 0.3333333432674408, "rewards/accuracy_reward/std": 0.4815433919429779, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.9226285219192505, "sampling/importance_sampling_ratio/mean": 0.99992436170578, "sampling/importance_sampling_ratio/min": 0.4685935080051422, "sampling/sampling_logp_difference/max": 0.7580196857452393, "sampling/sampling_logp_difference/mean": 0.009378405287861824, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5861.0, "completions/max_terminated_length": 5861.0, "completions/mean_length": 2810.33349609375, "completions/mean_terminated_length": 2810.33349609375, "completions/min_length": 1107.0, "completions/min_terminated_length": 1107.0, "entropy": 0.4235195219516754, "epoch": 0.2783109404990403, "frac_reward_zero_std": 0.6666666865348816, "grad_norm": 0.03629075281885533, "kl": 0.0023612062796019018, "learning_rate": 8.215220977259855e-07, "loss": 0.0218, "num_tokens": 37873610.0, "reward": 0.4166666865348816, "reward_std": 0.2357022613286972, "rewards/accuracy_reward/mean": 0.0416666679084301, "rewards/accuracy_reward/std": 0.20412413775920868, "rewards/code_reward/mean": 0.375, "rewards/code_reward/std": 0.494535356760025, "sampling/importance_sampling_ratio/max": 1.6514791250228882, "sampling/importance_sampling_ratio/mean": 1.0000368356704712, "sampling/importance_sampling_ratio/min": 0.4074474275112152, "sampling/sampling_logp_difference/max": 0.8978433609008789, "sampling/sampling_logp_difference/mean": 0.01216711476445198, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5746.0, "completions/max_terminated_length": 5746.0, "completions/mean_length": 2400.83349609375, "completions/mean_terminated_length": 2400.83349609375, "completions/min_length": 680.0, "completions/min_terminated_length": 680.0, "entropy": 0.3942362889647484, "epoch": 0.27895073576455537, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.0991785430464145, "kl": 0.0023258228902705014, "learning_rate": 8.207517997043504e-07, "loss": -0.0786, "num_tokens": 37950502.0, "reward": 1.4166667461395264, "reward_std": 0.5533831119537354, "rewards/accuracy_reward/mean": 0.7083333134651184, "rewards/accuracy_reward/std": 0.4643056094646454, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.5073353052139282, "sampling/importance_sampling_ratio/mean": 0.9999756813049316, "sampling/importance_sampling_ratio/min": 0.4173964560031891, "sampling/sampling_logp_difference/max": 0.8737187385559082, "sampling/sampling_logp_difference/mean": 0.011207791045308113, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 5428.0, "completions/max_terminated_length": 5428.0, "completions/mean_length": 2971.166748046875, "completions/mean_terminated_length": 2971.166748046875, "completions/min_length": 1417.0, "completions/min_terminated_length": 1417.0, "entropy": 0.32748908549547195, "epoch": 0.27959053103007037, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.09456579642453632, "kl": 0.002161363314371556, "learning_rate": 8.19980205844615e-07, "loss": -0.0654, "num_tokens": 38044442.0, "reward": 1.0416667461395264, "reward_std": 0.42645785212516785, "rewards/accuracy_reward/mean": 0.4583333432674408, "rewards/accuracy_reward/std": 0.5089773535728455, "rewards/code_reward/mean": 0.5833333134651184, "rewards/code_reward/std": 0.5036101341247559, "sampling/importance_sampling_ratio/max": 1.6317384243011475, "sampling/importance_sampling_ratio/mean": 0.9997601509094238, "sampling/importance_sampling_ratio/min": 0.34113195538520813, "sampling/sampling_logp_difference/max": 1.0754859447479248, "sampling/sampling_logp_difference/mean": 0.009061060845851898, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2445.0, "completions/max_terminated_length": 2445.0, "completions/mean_length": 1488.2083740234375, "completions/mean_terminated_length": 1488.2083740234375, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "entropy": 0.1679430976510048, "epoch": 0.2802303262955854, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10798158533416873, "kl": 0.0026455525076016784, "learning_rate": 8.192073192640203e-07, "loss": -0.0632, "num_tokens": 38098367.0, "reward": 1.6666667461395264, "reward_std": 0.5533831119537354, "rewards/accuracy_reward/mean": 0.8333333134651184, "rewards/accuracy_reward/std": 0.3806934952735901, "rewards/code_reward/mean": 0.8333333134651184, "rewards/code_reward/std": 0.3806934952735901, "sampling/importance_sampling_ratio/max": 1.4584249258041382, "sampling/importance_sampling_ratio/mean": 1.0001482963562012, "sampling/importance_sampling_ratio/min": 0.2795843482017517, "sampling/sampling_logp_difference/max": 1.2744512557983398, "sampling/sampling_logp_difference/mean": 0.005401706323027611, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6257.0, "completions/max_terminated_length": 6257.0, "completions/mean_length": 2899.416748046875, "completions/mean_terminated_length": 2899.416748046875, "completions/min_length": 1514.0, "completions/min_terminated_length": 1514.0, "entropy": 0.23880596458911896, "epoch": 0.28087012156110047, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.07670608958098071, "kl": 0.0023960391408763826, "learning_rate": 8.184331430850302e-07, "loss": 0.0645, "num_tokens": 38188561.0, "reward": 0.9166666865348816, "reward_std": 0.487678587436676, "rewards/accuracy_reward/mean": 0.375, "rewards/accuracy_reward/std": 0.494535356760025, "rewards/code_reward/mean": 0.5416666865348816, "rewards/code_reward/std": 0.5089773535728455, "sampling/importance_sampling_ratio/max": 1.4879891872406006, "sampling/importance_sampling_ratio/mean": 1.0001147985458374, "sampling/importance_sampling_ratio/min": 0.6306875944137573, "sampling/sampling_logp_difference/max": 0.46094465255737305, "sampling/sampling_logp_difference/mean": 0.006848443299531937, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 4413.0, "completions/max_terminated_length": 4413.0, "completions/mean_length": 2086.625, "completions/mean_terminated_length": 2086.625, "completions/min_length": 1028.0, "completions/min_terminated_length": 1028.0, "entropy": 0.3047335222363472, "epoch": 0.28150991682661547, "frac_reward_zero_std": 0.3333333432674408, "grad_norm": 0.10374583467473857, "kl": 0.002403319056611508, "learning_rate": 8.176576804353185e-07, "loss": -0.0084, "num_tokens": 38256944.0, "reward": 1.125, "reward_std": 0.2721545100212097, "rewards/accuracy_reward/mean": 0.4166666567325592, "rewards/accuracy_reward/std": 0.5036101937294006, "rewards/code_reward/mean": 0.7083333134651184, "rewards/code_reward/std": 0.4643056094646454, "sampling/importance_sampling_ratio/max": 1.7537635564804077, "sampling/importance_sampling_ratio/mean": 0.9999677538871765, "sampling/importance_sampling_ratio/min": 0.6887691020965576, "sampling/sampling_logp_difference/max": 0.561764121055603, "sampling/sampling_logp_difference/mean": 0.00889765378087759, "step": 440 } ], "logging_steps": 1.0, "max_steps": 1563, "num_input_tokens_seen": 38256944, "num_train_epochs": 1, "save_steps": 20, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }