{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.0008, "eval_steps": 500, "global_step": 100, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1790.0, "completions/max_terminated_length": 1790.0, "completions/mean_length": 182.1484375, "completions/mean_terminated_length": 182.1484375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.28799160569906235, "epoch": 8e-06, "frac_reward_zero_std": 0.5625, "grad_norm": 0.015653066337108612, "kl": 9.092812547351059e-07, "learning_rate": 0.0, "loss": 0.0006, "num_tokens": 572435.0, "reward": 0.5309156775474548, "reward_std": 0.4604809284210205, "rewards/reward_func/mean": 0.5309156775474548, "rewards/reward_func/std": 0.4604808986186981, "sampling/importance_sampling_ratio/max": 1.9294580221176147, "sampling/importance_sampling_ratio/mean": 1.009232997894287, "sampling/importance_sampling_ratio/min": 0.34108418226242065, "sampling/sampling_logp_difference/max": 0.4118213653564453, "sampling/sampling_logp_difference/mean": 0.006867324002087116, "step": 1, "step_time": 92.00019569275901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2680463492870331, "epoch": 1.6e-05, "grad_norm": 0.030799396336078644, "kl": 3.06405117722619e-07, "learning_rate": 5e-06, "loss": -0.0008, "step": 2, "step_time": 32.768778500845656 }, { "clip_ratio/high_max": 0.0007134150364436209, "clip_ratio/high_mean": 8.917687955545262e-05, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 8.917687955545262e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 5461.0, "completions/max_terminated_length": 5461.0, "completions/mean_length": 492.9140625, "completions/mean_terminated_length": 492.9140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2738131880760193, "epoch": 2.4e-05, "frac_reward_zero_std": 0.3125, "grad_norm": 0.04047725349664688, "kl": 0.00043132787686772645, "learning_rate": 1e-05, "loss": 0.0128, "num_tokens": 1023912.0, "reward": 0.36626869440078735, "reward_std": 0.465108722448349, "rewards/reward_func/mean": 0.36626869440078735, "rewards/reward_func/std": 0.465108722448349, "sampling/importance_sampling_ratio/max": 2.1430017948150635, "sampling/importance_sampling_ratio/mean": 0.9558022022247314, "sampling/importance_sampling_ratio/min": 0.020946042612195015, "sampling/sampling_logp_difference/max": 1.0668578147888184, "sampling/sampling_logp_difference/mean": 0.012038183398544788, "step": 3, "step_time": 149.4881290521007 }, { "clip_ratio/high_max": 0.000916204895474948, "clip_ratio/high_mean": 0.0001145256119343685, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001145256119343685, "entropy": 0.3138197138905525, "epoch": 3.2e-05, "grad_norm": 0.022033225744962692, "kl": 0.0005088059915578924, "learning_rate": 1.5e-05, "loss": -0.025, "step": 4, "step_time": 58.02533454587683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8909.0, "completions/mean_length": 1400.8125, "completions/mean_terminated_length": 401.933349609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3101617470383644, "epoch": 4e-05, "frac_reward_zero_std": 0.4375, "grad_norm": 0.0025309158954769373, "kl": 0.0005323870427673683, "learning_rate": 2e-05, "loss": -0.0, "num_tokens": 1820264.0, "reward": 0.4536225199699402, "reward_std": 0.49777504801750183, "rewards/reward_func/mean": 0.4536225199699402, "rewards/reward_func/std": 0.49777501821517944, "sampling/importance_sampling_ratio/max": 1.2027839422225952, "sampling/importance_sampling_ratio/mean": 0.8852319717407227, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.777749061584473, "sampling/sampling_logp_difference/mean": 0.01063137874007225, "step": 5, "step_time": 239.55608439911157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.3320774510502815, "epoch": 4.8e-05, "grad_norm": 0.002884262939915061, "kl": 0.0017297266749665141, "learning_rate": 2.5e-05, "loss": 0.0, "step": 6, "step_time": 59.483061871957034 }, { "clip_ratio/high_max": 0.0013319647405296564, "clip_ratio/high_mean": 0.00016649559256620705, "clip_ratio/low_mean": 0.00011880823876708746, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002853038313332945, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 12890.0, "completions/mean_length": 2285.6953125, "completions/mean_terminated_length": 960.2137451171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.28958937525749207, "epoch": 5.6e-05, "frac_reward_zero_std": 0.4375, "grad_norm": 0.0031948399264365435, "kl": 0.0007329104555537924, "learning_rate": 3e-05, "loss": -0.0075, "num_tokens": 2596929.0, "reward": 0.37591269612312317, "reward_std": 0.46268102526664734, "rewards/reward_func/mean": 0.37591269612312317, "rewards/reward_func/std": 0.46268102526664734, "sampling/importance_sampling_ratio/max": 1.3332258462905884, "sampling/importance_sampling_ratio/mean": 0.7790708541870117, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.7908563613891602, "sampling/sampling_logp_difference/mean": 0.011350465007126331, "step": 7, "step_time": 291.5669959378429 }, { "clip_ratio/high_max": 0.06269620433158707, "clip_ratio/high_mean": 0.007905740383648663, "clip_ratio/low_mean": 3.790311166085303e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007943643497128505, "entropy": 0.27083906158804893, "epoch": 6.4e-05, "grad_norm": 0.0021086863707751036, "kl": 0.0014875386259518564, "learning_rate": 3.5e-05, "loss": -0.0005, "step": 8, "step_time": 86.35025516920723 }, { "clip_ratio/high_max": 0.0006997402815613896, "clip_ratio/high_mean": 0.00015431304200319573, "clip_ratio/low_mean": 0.00010484526228538016, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002591583006505971, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14202.0, "completions/mean_length": 2823.390625, "completions/mean_terminated_length": 1797.7984619140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14772238209843636, "epoch": 7.2e-05, "frac_reward_zero_std": 0.625, "grad_norm": 0.006001872010529041, "kl": 0.000943778213695623, "learning_rate": 4e-05, "loss": 0.0108, "num_tokens": 3207611.0, "reward": 0.5036642551422119, "reward_std": 0.49157199263572693, "rewards/reward_func/mean": 0.5036642551422119, "rewards/reward_func/std": 0.49157199263572693, "sampling/importance_sampling_ratio/max": 1.5270230770111084, "sampling/importance_sampling_ratio/mean": 0.7227185964584351, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.1080689430236816, "sampling/sampling_logp_difference/mean": 0.004280484281480312, "step": 9, "step_time": 252.9167389899958 }, { "clip_ratio/high_max": 0.001605564437340945, "clip_ratio/high_mean": 0.00024234261945821345, "clip_ratio/low_mean": 9.370225598104298e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00033604487907723524, "entropy": 0.19918649271130562, "epoch": 8e-05, "grad_norm": 0.0017423235112801194, "kl": 0.0017632123199291527, "learning_rate": 4.5e-05, "loss": 0.0011, "step": 10, "step_time": 71.32449517771602 }, { "clip_ratio/high_max": 0.0016184170162887312, "clip_ratio/high_mean": 0.0002023021270360914, "clip_ratio/low_mean": 2.689328721316997e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00022919541515875608, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15886.0, "completions/mean_length": 1933.03125, "completions/mean_terminated_length": 708.3728637695312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.283096544444561, "epoch": 8.8e-05, "frac_reward_zero_std": 0.375, "grad_norm": 0.005225648172199726, "kl": 0.020552616333588958, "learning_rate": 5e-05, "loss": -0.0047, "num_tokens": 3911607.0, "reward": 0.3723485767841339, "reward_std": 0.45915600657463074, "rewards/reward_func/mean": 0.3723485767841339, "rewards/reward_func/std": 0.45915600657463074, "sampling/importance_sampling_ratio/max": 2.6595818996429443, "sampling/importance_sampling_ratio/mean": 0.8330748081207275, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.817470550537109, "sampling/sampling_logp_difference/mean": 0.018553579226136208, "step": 11, "step_time": 305.5446167134214 }, { "clip_ratio/high_max": 0.11326734800240956, "clip_ratio/high_mean": 0.014203241193172289, "clip_ratio/low_mean": 0.02032394427806139, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03452718561311485, "entropy": 0.31704550981521606, "epoch": 9.6e-05, "grad_norm": 0.0021242008078843355, "kl": 0.041884748614393175, "learning_rate": 5.500000000000001e-05, "loss": -0.0024, "step": 12, "step_time": 100.18367045000196 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14215.0, "completions/mean_length": 994.609375, "completions/mean_terminated_length": 498.1773986816406, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.21508953720331192, "epoch": 0.000104, "frac_reward_zero_std": 0.4375, "grad_norm": 0.008786521852016449, "kl": 0.010858730238396674, "learning_rate": 6e-05, "loss": 0.0112, "num_tokens": 4508533.0, "reward": 0.49886971712112427, "reward_std": 0.479078471660614, "rewards/reward_func/mean": 0.49886971712112427, "rewards/reward_func/std": 0.479078471660614, "sampling/importance_sampling_ratio/max": 1.2790054082870483, "sampling/importance_sampling_ratio/mean": 0.9344986081123352, "sampling/importance_sampling_ratio/min": 1.8428319634167245e-11, "sampling/sampling_logp_difference/max": 1.4877722263336182, "sampling/sampling_logp_difference/mean": 0.008521802723407745, "step": 13, "step_time": 443.75575554184616 }, { "clip_ratio/high_max": 0.20842555643321248, "clip_ratio/high_mean": 0.031284590383620525, "clip_ratio/low_mean": 0.03960632954840548, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07089091930538416, "entropy": 0.18631769344210625, "epoch": 0.000112, "grad_norm": 0.003839960554614663, "kl": 0.3403822723776102, "learning_rate": 6.500000000000001e-05, "loss": -0.0202, "step": 14, "step_time": 85.95637208526023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0001768385773175396, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0001768385773175396, "completions/clipped_ratio": 0.0, "completions/max_length": 5468.0, "completions/max_terminated_length": 5468.0, "completions/mean_length": 578.9375, "completions/mean_terminated_length": 578.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.159299585968256, "epoch": 0.00012, "frac_reward_zero_std": 0.5625, "grad_norm": 0.012311691418290138, "kl": 0.01162221294362098, "learning_rate": 7e-05, "loss": 0.0128, "num_tokens": 5117069.0, "reward": 0.5088721513748169, "reward_std": 0.4710608422756195, "rewards/reward_func/mean": 0.5088721513748169, "rewards/reward_func/std": 0.4710608124732971, "sampling/importance_sampling_ratio/max": 2.0296452045440674, "sampling/importance_sampling_ratio/mean": 0.9467288255691528, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.81820821762085, "sampling/sampling_logp_difference/mean": 0.011304730549454689, "step": 15, "step_time": 139.54756120592356 }, { "clip_ratio/high_max": 0.1666666716337204, "clip_ratio/high_mean": 0.02633665595203638, "clip_ratio/low_mean": 0.027520230985828675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.053856888320297, "entropy": 0.24024979025125504, "epoch": 0.000128, "grad_norm": 0.004696316551417112, "kl": 0.04927169228903949, "learning_rate": 7.500000000000001e-05, "loss": 0.0008, "step": 16, "step_time": 51.39879226591438 }, { "clip_ratio/high_max": 0.001534179231384769, "clip_ratio/high_mean": 0.00019177240392309614, "clip_ratio/low_mean": 5.4063617426436394e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00024583602498751134, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12325.0, "completions/mean_length": 2109.71875, "completions/mean_terminated_length": 900.0338745117188, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.259117666631937, "epoch": 0.000136, "frac_reward_zero_std": 0.375, "grad_norm": 0.0027392900083214045, "kl": 0.022709450451657176, "learning_rate": 8e-05, "loss": -0.0015, "num_tokens": 5846785.0, "reward": 0.5021514892578125, "reward_std": 0.4946684241294861, "rewards/reward_func/mean": 0.5021514892578125, "rewards/reward_func/std": 0.4946684241294861, "sampling/importance_sampling_ratio/max": 1.3155046701431274, "sampling/importance_sampling_ratio/mean": 0.8164673447608948, "sampling/importance_sampling_ratio/min": 3.531794431563262e-12, "sampling/sampling_logp_difference/max": 1.9771251678466797, "sampling/sampling_logp_difference/mean": 0.010305984877049923, "step": 17, "step_time": 405.8358749570325 }, { "clip_ratio/high_max": 0.1666666716337204, "clip_ratio/high_mean": 0.021097486838698387, "clip_ratio/low_mean": 0.011532710865139961, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03263019863516092, "entropy": 0.1985614150762558, "epoch": 0.000144, "grad_norm": 0.005495882593095303, "kl": 0.03126369323581457, "learning_rate": 8.5e-05, "loss": -0.0155, "step": 18, "step_time": 142.6410668361932 }, { "clip_ratio/high_max": 0.0031972584838513285, "clip_ratio/high_mean": 0.0005761296160926577, "clip_ratio/low_mean": 0.0004054550954606384, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009815847006393597, "completions/clipped_ratio": 0.1796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16000.0, "completions/mean_length": 4290.2109375, "completions/mean_terminated_length": 1641.0953369140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09939680807292461, "epoch": 0.000152, "frac_reward_zero_std": 0.375, "grad_norm": 0.004325419198721647, "kl": 0.025787187332753092, "learning_rate": 9e-05, "loss": 0.0035, "num_tokens": 6617028.0, "reward": 0.4427623152732849, "reward_std": 0.4857458472251892, "rewards/reward_func/mean": 0.4427623152732849, "rewards/reward_func/std": 0.4857458472251892, "sampling/importance_sampling_ratio/max": 1.2794967889785767, "sampling/importance_sampling_ratio/mean": 0.6535032987594604, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 3.10308837890625, "sampling/sampling_logp_difference/mean": 0.007126981392502785, "step": 19, "step_time": 253.69875198067166 }, { "clip_ratio/high_max": 0.002380200894549489, "clip_ratio/high_mean": 0.0004641784689738415, "clip_ratio/low_mean": 0.006076709658373147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006540888105519116, "entropy": 0.10411721095442772, "epoch": 0.00016, "grad_norm": 0.0034780765417963266, "kl": 0.08655104972422123, "learning_rate": 9.5e-05, "loss": -0.0049, "step": 20, "step_time": 54.08613259694539 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.0052374619990587234, "clip_ratio/low_mean": 7.05718994140625e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005308033898472786, "completions/clipped_ratio": 0.140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14426.0, "completions/mean_length": 2772.7421875, "completions/mean_terminated_length": 545.4454345703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.26159290969371796, "epoch": 0.000168, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0016806161729618907, "kl": 0.13244479056447744, "learning_rate": 0.0001, "loss": -0.0032, "num_tokens": 7608123.0, "reward": 0.30994826555252075, "reward_std": 0.4333060681819916, "rewards/reward_func/mean": 0.30994826555252075, "rewards/reward_func/std": 0.4333060681819916, "sampling/importance_sampling_ratio/max": 2.071073532104492, "sampling/importance_sampling_ratio/mean": 0.8328644633293152, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6652194261550903, "sampling/sampling_logp_difference/mean": 0.013313735835254192, "step": 21, "step_time": 408.48256488214247 }, { "clip_ratio/high_max": 0.0009995178115786985, "clip_ratio/high_mean": 0.0001249397264473373, "clip_ratio/low_mean": 0.01684358110651374, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01696852078748634, "entropy": 0.21135510131716728, "epoch": 0.000176, "grad_norm": 0.011167092248797417, "kl": 0.07416732516139746, "learning_rate": 0.0001, "loss": -0.0403, "step": 22, "step_time": 148.2932638968341 }, { "clip_ratio/high_max": 0.0006121824262663722, "clip_ratio/high_mean": 7.652280328329653e-05, "clip_ratio/low_mean": 0.0001096606720238924, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00018618347530718893, "completions/clipped_ratio": 0.171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 16332.0, "completions/mean_length": 3878.53125, "completions/mean_terminated_length": 1283.056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17607736214995384, "epoch": 0.000184, "frac_reward_zero_std": 0.6875, "grad_norm": 0.0007014994043856859, "kl": 0.027259970782324672, "learning_rate": 0.0001, "loss": -0.0, "num_tokens": 8549975.0, "reward": 0.4119563102722168, "reward_std": 0.4680332541465759, "rewards/reward_func/mean": 0.4119563102722168, "rewards/reward_func/std": 0.4680332839488983, "sampling/importance_sampling_ratio/max": 1.453569769859314, "sampling/importance_sampling_ratio/mean": 0.7403019666671753, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.4103574752807617, "sampling/sampling_logp_difference/mean": 0.00972401350736618, "step": 23, "step_time": 281.3584946768824 }, { "clip_ratio/high_max": 0.0006908245850354433, "clip_ratio/high_mean": 8.635307312943041e-05, "clip_ratio/low_mean": 0.00011251296746195294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00019886604059138335, "entropy": 0.21833522990345955, "epoch": 0.000192, "grad_norm": 0.0062239160761237144, "kl": 0.06342287547886372, "learning_rate": 0.0001, "loss": 0.0208, "step": 24, "step_time": 83.19132332946174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3324.0, "completions/max_terminated_length": 3324.0, "completions/mean_length": 350.3125, "completions/mean_terminated_length": 350.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.21079584956169128, "epoch": 0.0002, "frac_reward_zero_std": 0.5625, "grad_norm": 0.01816585287451744, "kl": 0.047579593025147915, "learning_rate": 0.0001, "loss": -0.0, "num_tokens": 9024447.0, "reward": 0.5932583212852478, "reward_std": 0.4719974398612976, "rewards/reward_func/mean": 0.5932583212852478, "rewards/reward_func/std": 0.4719974100589752, "sampling/importance_sampling_ratio/max": 2.610856533050537, "sampling/importance_sampling_ratio/mean": 0.9387929439544678, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8608701229095459, "sampling/sampling_logp_difference/mean": 0.012685808353126049, "step": 25, "step_time": 86.05315418587998 }, { "clip_ratio/high_max": 0.2500000037252903, "clip_ratio/high_mean": 0.06541509041562676, "clip_ratio/low_mean": 0.0699066836386919, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.13532177731394768, "entropy": 0.16260286793112755, "epoch": 0.000208, "grad_norm": 0.009579629637300968, "kl": 1.1700078528374434, "learning_rate": 0.0001, "loss": -0.0004, "step": 26, "step_time": 34.18811777303927 }, { "clip_ratio/high_max": 0.004500097595155239, "clip_ratio/high_mean": 0.0007490973512176424, "clip_ratio/low_mean": 2.6662485652195755e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0007757598377793329, "completions/clipped_ratio": 0.078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11635.0, "completions/mean_length": 2257.7578125, "completions/mean_terminated_length": 1060.61865234375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.1516659539192915, "epoch": 0.000216, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0051688519306480885, "kl": 0.047932930290699005, "learning_rate": 0.0001, "loss": 0.0004, "num_tokens": 9745576.0, "reward": 0.46375519037246704, "reward_std": 0.48423123359680176, "rewards/reward_func/mean": 0.46375519037246704, "rewards/reward_func/std": 0.48423123359680176, "sampling/importance_sampling_ratio/max": 2.536548376083374, "sampling/importance_sampling_ratio/mean": 0.8151211738586426, "sampling/importance_sampling_ratio/min": 5.257729753793683e-06, "sampling/sampling_logp_difference/max": 6.129981994628906, "sampling/sampling_logp_difference/mean": 0.005175800062716007, "step": 27, "step_time": 436.48245814908296 }, { "clip_ratio/high_max": 0.08695227152202278, "clip_ratio/high_mean": 0.011061059150961228, "clip_ratio/low_mean": 0.010714802792790579, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021775862463982776, "entropy": 0.13691149465739727, "epoch": 0.000224, "grad_norm": 0.024410562589764595, "kl": 0.03237813455052674, "learning_rate": 0.0001, "loss": 0.0973, "step": 28, "step_time": 156.84757056180388 }, { "clip_ratio/high_max": 0.0003620828501880169, "clip_ratio/high_mean": 6.965760076127481e-05, "clip_ratio/low_mean": 0.0001634064483369002, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00023306404909817502, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15615.0, "completions/mean_length": 2270.578125, "completions/mean_terminated_length": 1203.176513671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.25759194791316986, "epoch": 0.000232, "frac_reward_zero_std": 0.4375, "grad_norm": 0.012714912183582783, "kl": 0.10287779942154884, "learning_rate": 0.0001, "loss": -0.0209, "num_tokens": 10600314.0, "reward": 0.4594896733760834, "reward_std": 0.48041772842407227, "rewards/reward_func/mean": 0.4594896733760834, "rewards/reward_func/std": 0.4804176688194275, "sampling/importance_sampling_ratio/max": 2.5663199424743652, "sampling/importance_sampling_ratio/mean": 0.8002752065658569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.7162349224090576, "sampling/sampling_logp_difference/mean": 0.015552837401628494, "step": 29, "step_time": 263.5061617055908 }, { "clip_ratio/high_max": 0.04263468802673742, "clip_ratio/high_mean": 0.005374936816224363, "clip_ratio/low_mean": 0.01608989532542182, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021464832054334693, "entropy": 0.2745072916150093, "epoch": 0.00024, "grad_norm": 0.0038172281347215176, "kl": 0.09374767541885376, "learning_rate": 0.0001, "loss": -0.0059, "step": 30, "step_time": 65.82626755977981 }, { "clip_ratio/high_max": 0.0027790770400315523, "clip_ratio/high_mean": 0.00034738463000394404, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00034738463000394404, "completions/clipped_ratio": 0.0, "completions/max_length": 2434.0, "completions/max_terminated_length": 2434.0, "completions/mean_length": 264.4453125, "completions/mean_terminated_length": 264.4453125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.23240000382065773, "epoch": 0.000248, "frac_reward_zero_std": 0.5625, "grad_norm": 0.018018538132309914, "kl": 0.07072961144149303, "learning_rate": 0.0001, "loss": -0.0044, "num_tokens": 11270435.0, "reward": 0.5666041970252991, "reward_std": 0.49271145462989807, "rewards/reward_func/mean": 0.5666041970252991, "rewards/reward_func/std": 0.4927114248275757, "sampling/importance_sampling_ratio/max": 1.8200021982192993, "sampling/importance_sampling_ratio/mean": 0.97586989402771, "sampling/importance_sampling_ratio/min": 0.1302126795053482, "sampling/sampling_logp_difference/max": 0.7188519239425659, "sampling/sampling_logp_difference/mean": 0.010895353741943836, "step": 31, "step_time": 103.47118871309794 }, { "clip_ratio/high_max": 0.22714198799803853, "clip_ratio/high_mean": 0.06246224191272631, "clip_ratio/low_mean": 0.05428445339202881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.11674669571220875, "entropy": 0.202346783131361, "epoch": 0.000256, "grad_norm": 0.03462144732475281, "kl": 0.26291508600115776, "learning_rate": 0.0001, "loss": -0.0331, "step": 32, "step_time": 44.26117577217519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 16384.0, "completions/max_terminated_length": 1344.0, "completions/mean_length": 3158.328125, "completions/mean_terminated_length": 106.25000762939453, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3173985183238983, "epoch": 0.000264, "frac_reward_zero_std": 0.625, "grad_norm": 0.0015116184949874878, "kl": 0.17226483300328255, "learning_rate": 0.0001, "loss": 0.0, "num_tokens": 12009445.0, "reward": 0.37367647886276245, "reward_std": 0.4661514163017273, "rewards/reward_func/mean": 0.37367647886276245, "rewards/reward_func/std": 0.4661514163017273, "sampling/importance_sampling_ratio/max": 1.266974687576294, "sampling/importance_sampling_ratio/mean": 0.8097731471061707, "sampling/importance_sampling_ratio/min": 5.553430160176731e-09, "sampling/sampling_logp_difference/max": 13.181495666503906, "sampling/sampling_logp_difference/mean": 0.015717996284365654, "step": 33, "step_time": 252.64742845576257 }, { "clip_ratio/high_max": 0.2083333395421505, "clip_ratio/high_mean": 0.026041667442768812, "clip_ratio/low_mean": 0.015625000465661287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04166666744276881, "entropy": 0.3447694480419159, "epoch": 0.000272, "grad_norm": 0.0012050194200128317, "kl": 0.09908118983730674, "learning_rate": 0.0001, "loss": -0.0, "step": 34, "step_time": 52.40133061888628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 10039.0, "completions/max_terminated_length": 10039.0, "completions/mean_length": 676.140625, "completions/mean_terminated_length": 676.140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.36223024874925613, "epoch": 0.00028, "frac_reward_zero_std": 0.625, "grad_norm": 0.006729471497237682, "kl": 0.2994176782667637, "learning_rate": 0.0001, "loss": 0.0002, "num_tokens": 12784223.0, "reward": 0.35155534744262695, "reward_std": 0.46008607745170593, "rewards/reward_func/mean": 0.35155534744262695, "rewards/reward_func/std": 0.46008607745170593, "sampling/importance_sampling_ratio/max": 1.6923160552978516, "sampling/importance_sampling_ratio/mean": 0.9230321645736694, "sampling/importance_sampling_ratio/min": 4.0193415544627353e-13, "sampling/sampling_logp_difference/max": 2.2721805572509766, "sampling/sampling_logp_difference/mean": 0.019083332270383835, "step": 35, "step_time": 156.88474073121324 }, { "clip_ratio/high_max": 0.21071429178118706, "clip_ratio/high_mean": 0.04456845438107848, "clip_ratio/low_mean": 0.042559525929391384, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08712797984480858, "entropy": 0.3039735332131386, "epoch": 0.000288, "grad_norm": 0.002629757858812809, "kl": 1.421083964407444, "learning_rate": 0.0001, "loss": -0.0001, "step": 36, "step_time": 47.79177230759524 }, { "clip_ratio/high_max": 0.00212460938928416, "clip_ratio/high_mean": 0.0004709042623289861, "clip_ratio/low_mean": 0.00013680529809789732, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006077095604268834, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14803.0, "completions/mean_length": 1618.3046875, "completions/mean_terminated_length": 1383.9287109375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.205389566719532, "epoch": 0.000296, "frac_reward_zero_std": 0.375, "grad_norm": 0.034038130193948746, "kl": 0.1238506119698286, "learning_rate": 0.0001, "loss": 0.0496, "num_tokens": 13562182.0, "reward": 0.5307304859161377, "reward_std": 0.4933399260044098, "rewards/reward_func/mean": 0.5307304859161377, "rewards/reward_func/std": 0.4933399558067322, "sampling/importance_sampling_ratio/max": 2.9194023609161377, "sampling/importance_sampling_ratio/mean": 0.8693970441818237, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 6.139076232910156, "sampling/sampling_logp_difference/mean": 0.00930335745215416, "step": 37, "step_time": 304.38122520502657 }, { "clip_ratio/high_max": 0.12984464410692453, "clip_ratio/high_mean": 0.017246030358364806, "clip_ratio/low_mean": 0.04759028274565935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06483631394803524, "entropy": 0.19659215956926346, "epoch": 0.000304, "grad_norm": 0.014286670833826065, "kl": 0.09789336752146482, "learning_rate": 0.0001, "loss": -0.0889, "step": 38, "step_time": 108.92287370702252 }, { "clip_ratio/high_max": 0.002319882420124486, "clip_ratio/high_mean": 0.0004717662050097715, "clip_ratio/low_mean": 4.9786372983362526e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005215525743551552, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14867.0, "completions/mean_length": 2032.5, "completions/mean_terminated_length": 1569.54833984375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19366027787327766, "epoch": 0.000312, "frac_reward_zero_std": 0.5, "grad_norm": 0.02529173344373703, "kl": 0.12496417853981256, "learning_rate": 0.0001, "loss": -0.1314, "num_tokens": 14363022.0, "reward": 0.6012634038925171, "reward_std": 0.46494749188423157, "rewards/reward_func/mean": 0.6012634038925171, "rewards/reward_func/std": 0.46494749188423157, "sampling/importance_sampling_ratio/max": 2.67891263961792, "sampling/importance_sampling_ratio/mean": 0.8825238943099976, "sampling/importance_sampling_ratio/min": 9.072877865667905e-12, "sampling/sampling_logp_difference/max": 2.937361478805542, "sampling/sampling_logp_difference/mean": 0.007425494492053986, "step": 39, "step_time": 242.51036442094482 }, { "clip_ratio/high_max": 0.004297120030969381, "clip_ratio/high_mean": 0.0005371400038711727, "clip_ratio/low_mean": 0.019620355626102537, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020157495309831575, "entropy": 0.2599617578089237, "epoch": 0.00032, "grad_norm": 0.003947969060391188, "kl": 0.17537523806095123, "learning_rate": 0.0001, "loss": -0.0074, "step": 40, "step_time": 64.70451782317832 }, { "clip_ratio/high_max": 0.05252361833117902, "clip_ratio/high_mean": 0.00663674037787132, "clip_ratio/low_mean": 0.0005198562575969845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007156596751883626, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12109.0, "completions/mean_length": 2499.59375, "completions/mean_terminated_length": 516.107177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.21654297411441803, "epoch": 0.000328, "frac_reward_zero_std": 0.375, "grad_norm": 0.0020191774237900972, "kl": 0.050669580698013306, "learning_rate": 0.0001, "loss": -0.0037, "num_tokens": 15148602.0, "reward": 0.41135668754577637, "reward_std": 0.4713096022605896, "rewards/reward_func/mean": 0.41135668754577637, "rewards/reward_func/std": 0.4713096022605896, "sampling/importance_sampling_ratio/max": 1.3856815099716187, "sampling/importance_sampling_ratio/mean": 0.8404459953308105, "sampling/importance_sampling_ratio/min": 8.073855711603073e-14, "sampling/sampling_logp_difference/max": 3.5181496143341064, "sampling/sampling_logp_difference/mean": 0.010210744105279446, "step": 41, "step_time": 249.73666059714742 }, { "clip_ratio/high_max": 0.051496061148100125, "clip_ratio/high_mean": 0.00651522628334078, "clip_ratio/low_mean": 0.010954441386274993, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017469667189288884, "entropy": 0.22261176258325577, "epoch": 0.000336, "grad_norm": 0.01327864546328783, "kl": 0.08568831626325846, "learning_rate": 0.0001, "loss": -0.0071, "step": 42, "step_time": 65.51700961985625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2104.0, "completions/max_terminated_length": 2104.0, "completions/mean_length": 337.921875, "completions/mean_terminated_length": 337.921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.18646935559809208, "epoch": 0.000344, "frac_reward_zero_std": 0.8125, "grad_norm": 0.009093403816223145, "kl": 0.04625028697773814, "learning_rate": 0.0001, "loss": -0.0001, "num_tokens": 15697984.0, "reward": 0.5859714150428772, "reward_std": 0.48348718881607056, "rewards/reward_func/mean": 0.5859714150428772, "rewards/reward_func/std": 0.48348718881607056, "sampling/importance_sampling_ratio/max": 2.9582595825195312, "sampling/importance_sampling_ratio/mean": 0.9583175182342529, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8077512979507446, "sampling/sampling_logp_difference/mean": 0.01057741791009903, "step": 43, "step_time": 83.54059210349806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02708333358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02708333358168602, "entropy": 0.1239668894559145, "epoch": 0.000352, "grad_norm": 0.002130000153556466, "kl": 0.7455503353849053, "learning_rate": 0.0001, "loss": 0.0, "step": 44, "step_time": 39.00458105024882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 7439.0, "completions/max_terminated_length": 7439.0, "completions/mean_length": 794.125, "completions/mean_terminated_length": 794.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17747851833701134, "epoch": 0.00036, "frac_reward_zero_std": 0.5625, "grad_norm": 0.009445400908589363, "kl": 0.19956867769360542, "learning_rate": 0.0001, "loss": 0.0001, "num_tokens": 16409168.0, "reward": 0.3401130437850952, "reward_std": 0.44747307896614075, "rewards/reward_func/mean": 0.3401130437850952, "rewards/reward_func/std": 0.44747307896614075, "sampling/importance_sampling_ratio/max": 1.213844895362854, "sampling/importance_sampling_ratio/mean": 0.8736224174499512, "sampling/importance_sampling_ratio/min": 1.1355460628692526e-05, "sampling/sampling_logp_difference/max": 1.4877896308898926, "sampling/sampling_logp_difference/mean": 0.009566227905452251, "step": 45, "step_time": 112.80105081154034 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.005241899751126766, "clip_ratio/low_mean": 0.05211690114811063, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05735880043357611, "entropy": 0.11628733202815056, "epoch": 0.000368, "grad_norm": 0.0010464430088177323, "kl": 0.5155483353883028, "learning_rate": 0.0001, "loss": -0.0001, "step": 46, "step_time": 33.51861782022752 }, { "clip_ratio/high_max": 0.0008075302612269297, "clip_ratio/high_mean": 0.00010094128265336622, "clip_ratio/low_mean": 0.00017487616059952416, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0002758174450718798, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 14041.0, "completions/mean_length": 2758.3125, "completions/mean_terminated_length": 949.5928955078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.20697598904371262, "epoch": 0.000376, "frac_reward_zero_std": 0.3125, "grad_norm": 0.008048626594245434, "kl": 0.31821640580892563, "learning_rate": 0.0001, "loss": 0.0341, "num_tokens": 17250760.0, "reward": 0.4497794508934021, "reward_std": 0.47367680072784424, "rewards/reward_func/mean": 0.4497794508934021, "rewards/reward_func/std": 0.4736768305301666, "sampling/importance_sampling_ratio/max": 1.4071918725967407, "sampling/importance_sampling_ratio/mean": 0.7693284749984741, "sampling/importance_sampling_ratio/min": 6.366646576258517e-14, "sampling/sampling_logp_difference/max": 2.0170488357543945, "sampling/sampling_logp_difference/mean": 0.014924651943147182, "step": 47, "step_time": 265.7373479530215 }, { "clip_ratio/high_max": 0.18790940550388768, "clip_ratio/high_mean": 0.04044250077276956, "clip_ratio/low_mean": 0.023039879743009806, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06348238191276323, "entropy": 0.20131932944059372, "epoch": 0.000384, "grad_norm": 0.002510676858946681, "kl": 0.3020637482404709, "learning_rate": 0.0001, "loss": 0.0018, "step": 48, "step_time": 63.53406945313327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 2.0148290786892176e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 2.0148290786892176e-05, "completions/clipped_ratio": 0.0, "completions/max_length": 1549.0, "completions/max_terminated_length": 1549.0, "completions/mean_length": 194.90625, "completions/mean_terminated_length": 194.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16713403165340424, "epoch": 0.000392, "frac_reward_zero_std": 0.5, "grad_norm": 0.03661969304084778, "kl": 0.27568595856428146, "learning_rate": 0.0001, "loss": 0.1756, "num_tokens": 17609516.0, "reward": 0.3761705458164215, "reward_std": 0.4686991274356842, "rewards/reward_func/mean": 0.3761705458164215, "rewards/reward_func/std": 0.4686991274356842, "sampling/importance_sampling_ratio/max": 2.2934863567352295, "sampling/importance_sampling_ratio/mean": 1.0030004978179932, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.8843307495117188, "sampling/sampling_logp_difference/mean": 0.011824723333120346, "step": 49, "step_time": 33.989881575806066 }, { "clip_ratio/high_max": 0.18333333730697632, "clip_ratio/high_mean": 0.03967476915568113, "clip_ratio/low_mean": 0.06265822611749172, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.10233299620449543, "entropy": 0.11747358739376068, "epoch": 0.0004, "grad_norm": 520.3370971679688, "kl": 138366.12171524763, "learning_rate": 0.0001, "loss": 0.9764, "step": 50, "step_time": 11.585765323834494 }, { "clip_ratio/high_max": 0.001699419430224225, "clip_ratio/high_mean": 0.00021242742877802812, "clip_ratio/low_mean": 8.518956747138873e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00029761699261143804, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14391.0, "completions/mean_length": 1964.4296875, "completions/mean_terminated_length": 1499.2822265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19312381371855736, "epoch": 0.000408, "frac_reward_zero_std": 0.375, "grad_norm": 0.004857824184000492, "kl": 0.11069730296730995, "learning_rate": 0.0001, "loss": -0.0, "num_tokens": 18231323.0, "reward": 0.5406150221824646, "reward_std": 0.474507600069046, "rewards/reward_func/mean": 0.5406150221824646, "rewards/reward_func/std": 0.474507600069046, "sampling/importance_sampling_ratio/max": 1.754118800163269, "sampling/importance_sampling_ratio/mean": 0.8371249437332153, "sampling/importance_sampling_ratio/min": 2.1961432238731815e-12, "sampling/sampling_logp_difference/max": 1.711458444595337, "sampling/sampling_logp_difference/mean": 0.011138837784528732, "step": 51, "step_time": 234.7052824080456 }, { "clip_ratio/high_max": 0.052048374724108726, "clip_ratio/high_mean": 0.006506046840513591, "clip_ratio/low_mean": 0.0005927347665419802, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007098781847162172, "entropy": 0.18790747597813606, "epoch": 0.000416, "grad_norm": 0.02879762463271618, "kl": 0.14645008742809296, "learning_rate": 0.0001, "loss": -0.0793, "step": 52, "step_time": 67.58970385813154 }, { "clip_ratio/high_max": 0.0011903044069185853, "clip_ratio/high_mean": 0.00014878805086482316, "clip_ratio/low_mean": 0.00016782619059085846, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003166142414556816, "completions/clipped_ratio": 0.015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14571.0, "completions/mean_length": 1242.5234375, "completions/mean_terminated_length": 1002.1826171875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2562270388007164, "epoch": 0.000424, "frac_reward_zero_std": 0.4375, "grad_norm": 0.003293583169579506, "kl": 0.14673679322004318, "learning_rate": 0.0001, "loss": -0.0, "num_tokens": 18904638.0, "reward": 0.515934944152832, "reward_std": 0.4869852364063263, "rewards/reward_func/mean": 0.515934944152832, "rewards/reward_func/std": 0.4869852364063263, "sampling/importance_sampling_ratio/max": 2.125849485397339, "sampling/importance_sampling_ratio/mean": 0.8796348571777344, "sampling/importance_sampling_ratio/min": 1.6146490811053127e-09, "sampling/sampling_logp_difference/max": 2.236321210861206, "sampling/sampling_logp_difference/mean": 0.00907333567738533, "step": 53, "step_time": 403.6396517488174 }, { "clip_ratio/high_max": 0.004773067426867783, "clip_ratio/high_mean": 0.0005966334283584729, "clip_ratio/low_mean": 9.633062768443779e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006929640658199787, "entropy": 0.16939815878868103, "epoch": 0.000432, "grad_norm": 0.005178892519325018, "kl": 0.12158099561929703, "learning_rate": 0.0001, "loss": -0.0092, "step": 54, "step_time": 155.7508629639633 }, { "clip_ratio/high_max": 0.00413007679162547, "clip_ratio/high_mean": 0.0005162595989531837, "clip_ratio/low_mean": 0.00044288246863288805, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.000959142082137987, "completions/clipped_ratio": 0.1640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15599.0, "completions/mean_length": 4115.78125, "completions/mean_terminated_length": 1708.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.14124679006636143, "epoch": 0.00044, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0008667530491948128, "kl": 0.15955708548426628, "learning_rate": 0.0001, "loss": 0.0005, "num_tokens": 19689170.0, "reward": 0.4943884015083313, "reward_std": 0.4778369963169098, "rewards/reward_func/mean": 0.4943884015083313, "rewards/reward_func/std": 0.4778369963169098, "sampling/importance_sampling_ratio/max": 1.5920473337173462, "sampling/importance_sampling_ratio/mean": 0.6500886082649231, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.026095390319824, "sampling/sampling_logp_difference/mean": 0.006507984362542629, "step": 55, "step_time": 295.36478371801786 }, { "clip_ratio/high_max": 0.006438895943574607, "clip_ratio/high_mean": 0.0010595864005153999, "clip_ratio/low_mean": 0.0001230314956046641, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0011826178670162335, "entropy": 0.16170401498675346, "epoch": 0.000448, "grad_norm": 0.0010976437479257584, "kl": 0.14007344283163548, "learning_rate": 0.0001, "loss": -0.0014, "step": 56, "step_time": 87.99333154270425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0005311340792104602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0005311340792104602, "completions/clipped_ratio": 0.1171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 10352.0, "completions/mean_length": 2540.15625, "completions/mean_terminated_length": 702.4778442382812, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.09304443560540676, "epoch": 0.000456, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0008479373645968735, "kl": 0.06781758088618517, "learning_rate": 0.0001, "loss": 0.0, "num_tokens": 20376822.0, "reward": 0.5963060855865479, "reward_std": 0.4900885820388794, "rewards/reward_func/mean": 0.5963060855865479, "rewards/reward_func/std": 0.4900885820388794, "sampling/importance_sampling_ratio/max": 1.911750316619873, "sampling/importance_sampling_ratio/mean": 0.787479043006897, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.4092633724212646, "sampling/sampling_logp_difference/mean": 0.004730356857180595, "step": 57, "step_time": 275.88833857746795 }, { "clip_ratio/high_max": 0.0007761355664115399, "clip_ratio/high_mean": 9.701694580144249e-05, "clip_ratio/low_mean": 0.0005300462580635212, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006270632075029425, "entropy": 0.07158766873180866, "epoch": 0.000464, "grad_norm": 0.0023808805271983147, "kl": 0.058913652785122395, "learning_rate": 0.0001, "loss": -0.0026, "step": 58, "step_time": 78.15128924208693 }, { "clip_ratio/high_max": 0.0045549869537353516, "clip_ratio/high_mean": 0.0005933154025115073, "clip_ratio/low_mean": 0.0014756222371943295, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020689376979134977, "completions/clipped_ratio": 0.1015625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14099.0, "completions/mean_length": 2205.8359375, "completions/mean_terminated_length": 603.0869140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12633545510470867, "epoch": 0.000472, "frac_reward_zero_std": 0.5, "grad_norm": 0.0027347116265445948, "kl": 0.06752120889723301, "learning_rate": 0.0001, "loss": -0.0058, "num_tokens": 20882249.0, "reward": 0.37335968017578125, "reward_std": 0.46674516797065735, "rewards/reward_func/mean": 0.37335968017578125, "rewards/reward_func/std": 0.46674516797065735, "sampling/importance_sampling_ratio/max": 1.1944468021392822, "sampling/importance_sampling_ratio/mean": 0.8154112100601196, "sampling/importance_sampling_ratio/min": 3.1055763429627126e-12, "sampling/sampling_logp_difference/max": 2.516141891479492, "sampling/sampling_logp_difference/mean": 0.005523150786757469, "step": 59, "step_time": 243.8482750041876 }, { "clip_ratio/high_max": 0.0070169707760214806, "clip_ratio/high_mean": 0.0008790283463895321, "clip_ratio/low_mean": 0.011722586234100163, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012601614958839491, "entropy": 0.19912216998636723, "epoch": 0.00048, "grad_norm": 0.004964805673807859, "kl": 0.3173718089237809, "learning_rate": 0.0001, "loss": -0.0111, "step": 60, "step_time": 67.48408747394569 }, { "clip_ratio/high_max": 0.003293595160357654, "clip_ratio/high_mean": 0.00041169939504470676, "clip_ratio/low_mean": 0.001325315679423511, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.001737015089020133, "completions/clipped_ratio": 0.140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 15692.0, "completions/mean_length": 3332.625, "completions/mean_terminated_length": 1196.9454345703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.20970237255096436, "epoch": 0.000488, "frac_reward_zero_std": 0.1875, "grad_norm": 0.0021050143986940384, "kl": 0.22093774378299713, "learning_rate": 0.0001, "loss": 0.0007, "num_tokens": 21614089.0, "reward": 0.39702850580215454, "reward_std": 0.48266974091529846, "rewards/reward_func/mean": 0.39702850580215454, "rewards/reward_func/std": 0.48266977071762085, "sampling/importance_sampling_ratio/max": 1.2085174322128296, "sampling/importance_sampling_ratio/mean": 0.7436270713806152, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.35117506980896, "sampling/sampling_logp_difference/mean": 0.009538266807794571, "step": 61, "step_time": 356.0667571427766 }, { "clip_ratio/high_max": 0.0040483163320459425, "clip_ratio/high_mean": 0.0008040911634452641, "clip_ratio/low_mean": 0.000619270489551127, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0014233616821002215, "entropy": 0.2903680093586445, "epoch": 0.000496, "grad_norm": 0.0013163138646632433, "kl": 0.22006989642977715, "learning_rate": 0.0001, "loss": 0.0, "step": 62, "step_time": 120.53421806404367 }, { "clip_ratio/high_max": 0.04629576357547194, "clip_ratio/high_mean": 0.006143865539343096, "clip_ratio/low_mean": 0.00034697772935032845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006490843268693425, "completions/clipped_ratio": 0.1875, "completions/max_length": 16384.0, "completions/max_terminated_length": 5936.0, "completions/mean_length": 3316.71875, "completions/mean_terminated_length": 301.19232177734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11050131916999817, "epoch": 0.000504, "frac_reward_zero_std": 0.375, "grad_norm": 0.011926773004233837, "kl": 0.7313324622809887, "learning_rate": 0.0001, "loss": 0.0115, "num_tokens": 22353821.0, "reward": 0.326221227645874, "reward_std": 0.46681538224220276, "rewards/reward_func/mean": 0.326221227645874, "rewards/reward_func/std": 0.46681535243988037, "sampling/importance_sampling_ratio/max": 1.7655757665634155, "sampling/importance_sampling_ratio/mean": 0.7426654100418091, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.8568098545074463, "sampling/sampling_logp_difference/mean": 0.006891004741191864, "step": 63, "step_time": 302.78563037491404 }, { "clip_ratio/high_max": 0.04654776549432427, "clip_ratio/high_mean": 0.0064325097628170624, "clip_ratio/low_mean": 0.0002609335570014082, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006693443312542513, "entropy": 0.2221522331237793, "epoch": 0.000512, "grad_norm": 0.00616535684093833, "kl": 0.31134266406297684, "learning_rate": 0.0001, "loss": -0.0099, "step": 64, "step_time": 98.32436606986448 }, { "clip_ratio/high_max": 0.002486778888851404, "clip_ratio/high_mean": 0.0003108473611064255, "clip_ratio/low_mean": 0.004113847695407458, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004424695056513883, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 12115.0, "completions/mean_length": 1701.78125, "completions/mean_terminated_length": 591.3613891601562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.21395106986165047, "epoch": 0.00052, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0024383016861975193, "kl": 0.6399696841835976, "learning_rate": 0.0001, "loss": -0.0004, "num_tokens": 23079545.0, "reward": 0.5035587549209595, "reward_std": 0.49234020709991455, "rewards/reward_func/mean": 0.5035587549209595, "rewards/reward_func/std": 0.49234020709991455, "sampling/importance_sampling_ratio/max": 2.8838155269622803, "sampling/importance_sampling_ratio/mean": 0.9122731685638428, "sampling/importance_sampling_ratio/min": 1.5084187154554285e-12, "sampling/sampling_logp_difference/max": 1.4747650623321533, "sampling/sampling_logp_difference/mean": 0.008541534654796124, "step": 65, "step_time": 420.5027240368072 }, { "clip_ratio/high_max": 0.006610034382902086, "clip_ratio/high_mean": 0.0008262542978627607, "clip_ratio/low_mean": 0.025520833674818277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026347088234615512, "entropy": 0.21546945348381996, "epoch": 0.000528, "grad_norm": 0.002588092116639018, "kl": 0.3157913535833359, "learning_rate": 0.0001, "loss": -0.0018, "step": 66, "step_time": 150.25083671603352 }, { "clip_ratio/high_max": 0.00305051077157259, "clip_ratio/high_mean": 0.0004194560169707984, "clip_ratio/low_mean": 0.0010861777554964647, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015056337579153478, "completions/clipped_ratio": 0.03125, "completions/max_length": 16384.0, "completions/max_terminated_length": 11704.0, "completions/mean_length": 1241.578125, "completions/mean_terminated_length": 753.1128540039062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.34802427887916565, "epoch": 0.000536, "frac_reward_zero_std": 0.1875, "grad_norm": 0.010660664178431034, "kl": 0.3574690632522106, "learning_rate": 0.0001, "loss": 0.0188, "num_tokens": 23671939.0, "reward": 0.30863311886787415, "reward_std": 0.4487622380256653, "rewards/reward_func/mean": 0.30863311886787415, "rewards/reward_func/std": 0.4487622380256653, "sampling/importance_sampling_ratio/max": 1.2251367568969727, "sampling/importance_sampling_ratio/mean": 0.8774986267089844, "sampling/importance_sampling_ratio/min": 5.643987203594533e-15, "sampling/sampling_logp_difference/max": 2.2548747062683105, "sampling/sampling_logp_difference/mean": 0.01900642365217209, "step": 67, "step_time": 272.54457034613006 }, { "clip_ratio/high_max": 0.04360994976013899, "clip_ratio/high_mean": 0.0056244394509121776, "clip_ratio/low_mean": 0.019384152255952358, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025008591823279858, "entropy": 0.30202219262719154, "epoch": 0.000544, "grad_norm": 0.0031096329912543297, "kl": 0.19528233632445335, "learning_rate": 0.0001, "loss": -0.0002, "step": 68, "step_time": 87.20590779092163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 7.0, "completions/mean_length": 1026.96875, "completions/mean_terminated_length": 3.1666667461395264, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.360334113240242, "epoch": 0.000552, "frac_reward_zero_std": 0.3125, "grad_norm": 0.004637387115508318, "kl": 0.21597419865429401, "learning_rate": 0.0001, "loss": 0.0001, "num_tokens": 24378991.0, "reward": 0.2120947688817978, "reward_std": 0.3439648747444153, "rewards/reward_func/mean": 0.2120947688817978, "rewards/reward_func/std": 0.3439648747444153, "sampling/importance_sampling_ratio/max": 1.2173486948013306, "sampling/importance_sampling_ratio/mean": 0.9408060908317566, "sampling/importance_sampling_ratio/min": 6.941108278424313e-12, "sampling/sampling_logp_difference/max": 2.992739677429199, "sampling/sampling_logp_difference/mean": 0.01907212659716606, "step": 69, "step_time": 413.641770795919 }, { "clip_ratio/high_max": 0.22500000521540642, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.01145833358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04479166818782687, "entropy": 0.43712426722049713, "epoch": 0.00056, "grad_norm": 0.0043171476572752, "kl": 0.2573888264596462, "learning_rate": 0.0001, "loss": -0.0001, "step": 70, "step_time": 149.37348693376407 }, { "clip_ratio/high_max": 0.011275041149929166, "clip_ratio/high_mean": 0.0016421006293967366, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0016421006293967366, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15718.0, "completions/mean_length": 1101.6171875, "completions/mean_terminated_length": 981.283447265625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24557911232113838, "epoch": 0.000568, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0067433081567287445, "kl": 0.1323122438043356, "learning_rate": 0.0001, "loss": -0.0056, "num_tokens": 24892926.0, "reward": 0.5961877107620239, "reward_std": 0.48414939641952515, "rewards/reward_func/mean": 0.5961877107620239, "rewards/reward_func/std": 0.48414939641952515, "sampling/importance_sampling_ratio/max": 2.302067279815674, "sampling/importance_sampling_ratio/mean": 0.8856508731842041, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.4530627727508545, "sampling/sampling_logp_difference/mean": 0.014587011188268661, "step": 71, "step_time": 211.82522862195037 }, { "clip_ratio/high_max": 0.05322292904020287, "clip_ratio/high_mean": 0.007361717482126551, "clip_ratio/low_mean": 0.0005052025571785634, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007866920397646027, "entropy": 0.24957521632313728, "epoch": 0.000576, "grad_norm": 0.03510262817144394, "kl": 0.10960768908262253, "learning_rate": 0.0001, "loss": 0.0024, "step": 72, "step_time": 55.394755602115765 }, { "clip_ratio/high_max": 0.04315747210057452, "clip_ratio/high_mean": 0.005394684012571815, "clip_ratio/low_mean": 0.01105505934174289, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01644974334340077, "completions/clipped_ratio": 0.0390625, "completions/max_length": 16384.0, "completions/max_terminated_length": 14039.0, "completions/mean_length": 1335.3359375, "completions/mean_terminated_length": 723.6016235351562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.24440882354974747, "epoch": 0.000584, "frac_reward_zero_std": 0.5, "grad_norm": 0.003040608251467347, "kl": 0.3741700351238251, "learning_rate": 0.0001, "loss": 0.0027, "num_tokens": 25583089.0, "reward": 0.5570250153541565, "reward_std": 0.4779793322086334, "rewards/reward_func/mean": 0.5570250153541565, "rewards/reward_func/std": 0.4779793322086334, "sampling/importance_sampling_ratio/max": 1.250884771347046, "sampling/importance_sampling_ratio/mean": 0.888532280921936, "sampling/importance_sampling_ratio/min": 2.9074090690528465e-08, "sampling/sampling_logp_difference/max": 1.549929141998291, "sampling/sampling_logp_difference/mean": 0.010362871922552586, "step": 73, "step_time": 457.1430780822411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01055803267081501, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01055803267081501, "entropy": 0.20154350250959396, "epoch": 0.000592, "grad_norm": 0.002142983488738537, "kl": 0.14010655879974365, "learning_rate": 0.0001, "loss": 0.0032, "step": 74, "step_time": 180.0332786256913 }, { "clip_ratio/high_max": 0.003929472557501867, "clip_ratio/high_mean": 0.000681739784340607, "clip_ratio/low_mean": 0.000918249599635601, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0015999893948901445, "completions/clipped_ratio": 0.171875, "completions/max_length": 16384.0, "completions/max_terminated_length": 13287.0, "completions/mean_length": 3697.3046875, "completions/mean_terminated_length": 1064.217041015625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.22590620815753937, "epoch": 0.0006, "frac_reward_zero_std": 0.25, "grad_norm": 0.003586079925298691, "kl": 0.14618558436632156, "learning_rate": 0.0001, "loss": -0.0088, "num_tokens": 26577264.0, "reward": 0.3558464050292969, "reward_std": 0.4764367341995239, "rewards/reward_func/mean": 0.3558464050292969, "rewards/reward_func/std": 0.4764367640018463, "sampling/importance_sampling_ratio/max": 1.2136788368225098, "sampling/importance_sampling_ratio/mean": 0.7275122404098511, "sampling/importance_sampling_ratio/min": 1.4362665263063827e-19, "sampling/sampling_logp_difference/max": 1.6977732181549072, "sampling/sampling_logp_difference/mean": 0.013559934683144093, "step": 75, "step_time": 279.72021683468483 }, { "clip_ratio/high_max": 0.03983482558396645, "clip_ratio/high_mean": 0.005230471204413334, "clip_ratio/low_mean": 0.004741664102766663, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009972134721465409, "entropy": 0.20037926360964775, "epoch": 0.000608, "grad_norm": 0.0038128597661852837, "kl": 0.12694548070430756, "learning_rate": 0.0001, "loss": -0.0052, "step": 76, "step_time": 75.0094793732278 }, { "clip_ratio/high_max": 0.0030184224306140095, "clip_ratio/high_mean": 0.0003773028038267512, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003773028038267512, "completions/clipped_ratio": 0.140625, "completions/max_length": 16384.0, "completions/max_terminated_length": 13004.0, "completions/mean_length": 2918.640625, "completions/mean_terminated_length": 715.2181396484375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2610139548778534, "epoch": 0.000616, "frac_reward_zero_std": 0.5, "grad_norm": 0.0017114380607381463, "kl": 0.35155298560857773, "learning_rate": 0.0001, "loss": -0.0019, "num_tokens": 27487122.0, "reward": 0.37231287360191345, "reward_std": 0.451972633600235, "rewards/reward_func/mean": 0.37231287360191345, "rewards/reward_func/std": 0.451972633600235, "sampling/importance_sampling_ratio/max": 1.72040593624115, "sampling/importance_sampling_ratio/mean": 0.8099797964096069, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.51240873336792, "sampling/sampling_logp_difference/mean": 0.01623906008899212, "step": 77, "step_time": 270.05139491008595 }, { "clip_ratio/high_max": 0.04338416282553226, "clip_ratio/high_mean": 0.00988730626704637, "clip_ratio/low_mean": 0.014765097017516382, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024652403328218497, "entropy": 0.31988539546728134, "epoch": 0.000624, "grad_norm": 0.01336484681814909, "kl": 0.2897513546049595, "learning_rate": 0.0001, "loss": -0.0337, "step": 78, "step_time": 79.79506105207838 }, { "clip_ratio/high_max": 0.05000000074505806, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "completions/clipped_ratio": 0.125, "completions/max_length": 16384.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 2130.7109375, "completions/mean_terminated_length": 94.52678680419922, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.2616175599396229, "epoch": 0.000632, "frac_reward_zero_std": 0.3125, "grad_norm": 0.003293546847999096, "kl": 0.34126188047230244, "learning_rate": 0.0001, "loss": -0.0058, "num_tokens": 28247557.0, "reward": 0.42832934856414795, "reward_std": 0.45818737149238586, "rewards/reward_func/mean": 0.42832934856414795, "rewards/reward_func/std": 0.45818737149238586, "sampling/importance_sampling_ratio/max": 2.1360301971435547, "sampling/importance_sampling_ratio/mean": 0.9051436185836792, "sampling/importance_sampling_ratio/min": 2.526717501893927e-09, "sampling/sampling_logp_difference/max": 2.0171802043914795, "sampling/sampling_logp_difference/mean": 0.017018210142850876, "step": 79, "step_time": 285.03405929682776 }, { "clip_ratio/high_max": 0.22500000521540642, "clip_ratio/high_mean": 0.03081387374550104, "clip_ratio/low_mean": 0.01876397612886649, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04957784991711378, "entropy": 0.2775324620306492, "epoch": 0.00064, "grad_norm": 0.003191626165062189, "kl": 0.21329555287957191, "learning_rate": 0.0001, "loss": 0.0083, "step": 80, "step_time": 82.6638121791184 }, { "clip_ratio/high_max": 0.0008650519303046167, "clip_ratio/high_mean": 0.00010813149128807709, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00010813149128807709, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 1379.0, "completions/mean_length": 1149.046875, "completions/mean_terminated_length": 133.3833465576172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17147246748209, "epoch": 0.000648, "frac_reward_zero_std": 0.5, "grad_norm": 0.0028895766008645296, "kl": 0.09206334501504898, "learning_rate": 0.0001, "loss": 0.0, "num_tokens": 28968019.0, "reward": 0.5178102254867554, "reward_std": 0.47222229838371277, "rewards/reward_func/mean": 0.5178102254867554, "rewards/reward_func/std": 0.47222229838371277, "sampling/importance_sampling_ratio/max": 1.396835446357727, "sampling/importance_sampling_ratio/mean": 0.9524275064468384, "sampling/importance_sampling_ratio/min": 1.0122628737008199e-05, "sampling/sampling_logp_difference/max": 1.3480243682861328, "sampling/sampling_logp_difference/mean": 0.026343410834670067, "step": 81, "step_time": 286.13709013699554 }, { "clip_ratio/high_max": 0.09256594924954697, "clip_ratio/high_mean": 0.021243362592940684, "clip_ratio/low_mean": 0.01618036488071084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.037423727568238974, "entropy": 0.19774584844708443, "epoch": 0.000656, "grad_norm": 0.003909197635948658, "kl": 0.20639685168862343, "learning_rate": 0.0001, "loss": -0.0001, "step": 82, "step_time": 81.07379846903495 }, { "clip_ratio/high_max": 0.0021464216988533735, "clip_ratio/high_mean": 0.0002683027123566717, "clip_ratio/low_mean": 0.005357950474717654, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0056262532161781564, "completions/clipped_ratio": 0.0, "completions/max_length": 3288.0, "completions/max_terminated_length": 3288.0, "completions/mean_length": 196.8671875, "completions/mean_terminated_length": 196.8671875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3457997739315033, "epoch": 0.000664, "frac_reward_zero_std": 0.3125, "grad_norm": 0.022016355767846107, "kl": 0.4088924489915371, "learning_rate": 0.0001, "loss": -0.007, "num_tokens": 29420586.0, "reward": 0.37082305550575256, "reward_std": 0.46140459179878235, "rewards/reward_func/mean": 0.37082305550575256, "rewards/reward_func/std": 0.46140459179878235, "sampling/importance_sampling_ratio/max": 1.4103598594665527, "sampling/importance_sampling_ratio/mean": 0.9493687748908997, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.0998687744140625, "sampling/sampling_logp_difference/mean": 0.023627880960702896, "step": 83, "step_time": 57.941960010211915 }, { "clip_ratio/high_max": 0.1273603499867022, "clip_ratio/high_mean": 0.016455542587209493, "clip_ratio/low_mean": 0.06458333507180214, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.08103887923061848, "entropy": 0.282495453953743, "epoch": 0.000672, "grad_norm": 0.021933559328317642, "kl": 0.2802862487733364, "learning_rate": 0.0001, "loss": -0.0041, "step": 84, "step_time": 18.895374842220917 }, { "clip_ratio/high_max": 0.0016916769818635657, "clip_ratio/high_mean": 0.00021145962273294572, "clip_ratio/low_mean": 0.0004629129107343033, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0006743725316482596, "completions/clipped_ratio": 0.0625, "completions/max_length": 16384.0, "completions/max_terminated_length": 8302.0, "completions/mean_length": 1417.296875, "completions/mean_terminated_length": 419.5166931152344, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19242028519511223, "epoch": 0.00068, "frac_reward_zero_std": 0.25, "grad_norm": 0.004599843639880419, "kl": 0.10945684090256691, "learning_rate": 0.0001, "loss": -0.0015, "num_tokens": 30223584.0, "reward": 0.38827669620513916, "reward_std": 0.4383874535560608, "rewards/reward_func/mean": 0.38827669620513916, "rewards/reward_func/std": 0.4383874237537384, "sampling/importance_sampling_ratio/max": 1.2852191925048828, "sampling/importance_sampling_ratio/mean": 0.8869220018386841, "sampling/importance_sampling_ratio/min": 6.8003160436092e-08, "sampling/sampling_logp_difference/max": 3.0428004264831543, "sampling/sampling_logp_difference/mean": 0.011553528718650341, "step": 85, "step_time": 401.25821340084076 }, { "clip_ratio/high_max": 0.08453038916923106, "clip_ratio/high_mean": 0.01064000147744082, "clip_ratio/low_mean": 0.041717116328072734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.052357119115185924, "entropy": 0.28386014327406883, "epoch": 0.000688, "grad_norm": 0.008475115522742271, "kl": 0.24625534750521183, "learning_rate": 0.0001, "loss": 0.0125, "step": 86, "step_time": 140.54234700393863 }, { "clip_ratio/high_max": 0.025748229207238182, "clip_ratio/high_mean": 0.0034044762833218556, "clip_ratio/low_mean": 0.0008440050805802457, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00424848121474497, "completions/clipped_ratio": 0.0859375, "completions/max_length": 16384.0, "completions/max_terminated_length": 14449.0, "completions/mean_length": 2038.8046875, "completions/mean_terminated_length": 690.1111450195312, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.17366936802864075, "epoch": 0.000696, "frac_reward_zero_std": 0.375, "grad_norm": 0.004945417400449514, "kl": 0.21483153477311134, "learning_rate": 0.0001, "loss": -0.0155, "num_tokens": 30996463.0, "reward": 0.569595456123352, "reward_std": 0.4689222276210785, "rewards/reward_func/mean": 0.569595456123352, "rewards/reward_func/std": 0.4689222574234009, "sampling/importance_sampling_ratio/max": 1.5022563934326172, "sampling/importance_sampling_ratio/mean": 0.8090132474899292, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.836258888244629, "sampling/sampling_logp_difference/mean": 0.012665043585002422, "step": 87, "step_time": 420.3839778539259 }, { "clip_ratio/high_max": 0.07500000111758709, "clip_ratio/high_mean": 0.009656705195084214, "clip_ratio/low_mean": 0.021538077868171968, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03119478444568813, "entropy": 0.14676123298704624, "epoch": 0.000704, "grad_norm": 0.0015365808503702283, "kl": 0.2069963738322258, "learning_rate": 0.0001, "loss": 0.0006, "step": 88, "step_time": 159.39437931077555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 249.2734375, "completions/mean_terminated_length": 249.2734375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.19034305587410927, "epoch": 0.000712, "frac_reward_zero_std": 0.5625, "grad_norm": 0.013274043798446655, "kl": 0.10685652680695057, "learning_rate": 0.0001, "loss": 0.0003, "num_tokens": 31367970.0, "reward": 0.5855048894882202, "reward_std": 0.4757256805896759, "rewards/reward_func/mean": 0.5855048894882202, "rewards/reward_func/std": 0.4757256805896759, "sampling/importance_sampling_ratio/max": 2.164741277694702, "sampling/importance_sampling_ratio/mean": 0.9093552827835083, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 0.9638514518737793, "sampling/sampling_logp_difference/mean": 0.010461562313139439, "step": 89, "step_time": 45.6294292754028 }, { "clip_ratio/high_max": 0.1666666716337204, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.02095832316626911, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05220832256600261, "entropy": 0.16832438856363297, "epoch": 0.00072, "grad_norm": 0.013677907176315784, "kl": 0.21795203164219856, "learning_rate": 0.0001, "loss": -0.0003, "step": 90, "step_time": 14.39668608084321 }, { "clip_ratio/high_max": 0.0016864245990291238, "clip_ratio/high_mean": 0.00021080307487864047, "clip_ratio/low_mean": 9.959549061022699e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00031039856548886746, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 14607.0, "completions/mean_length": 2265.78125, "completions/mean_terminated_length": 1198.016845703125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3109714537858963, "epoch": 0.000728, "frac_reward_zero_std": 0.5625, "grad_norm": 0.0026221030857414007, "kl": 0.29385947436094284, "learning_rate": 0.0001, "loss": -0.0016, "num_tokens": 31978134.0, "reward": 0.41071587800979614, "reward_std": 0.48560255765914917, "rewards/reward_func/mean": 0.41071587800979614, "rewards/reward_func/std": 0.48560255765914917, "sampling/importance_sampling_ratio/max": 2.1744186878204346, "sampling/importance_sampling_ratio/mean": 0.7339906692504883, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.330114483833313, "sampling/sampling_logp_difference/mean": 0.01747949793934822, "step": 91, "step_time": 224.5670603781473 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.00536864111199975, "clip_ratio/low_mean": 0.01056993727979716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01593857839179691, "entropy": 0.3097205422818661, "epoch": 0.000736, "grad_norm": 0.0013873938005417585, "kl": 0.21971550025045872, "learning_rate": 0.0001, "loss": 0.0, "step": 92, "step_time": 41.30870744702406 }, { "clip_ratio/high_max": 0.0018048831261694431, "clip_ratio/high_mean": 0.00023677908757235855, "clip_ratio/low_mean": 0.021204495129495626, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021441274860990234, "completions/clipped_ratio": 0.1796875, "completions/max_length": 16384.0, "completions/max_terminated_length": 8798.0, "completions/mean_length": 3474.4140625, "completions/mean_terminated_length": 646.6000366210938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.16217100247740746, "epoch": 0.000744, "frac_reward_zero_std": 0.375, "grad_norm": 0.00539315864443779, "kl": 0.22048377990722656, "learning_rate": 0.0001, "loss": 0.0072, "num_tokens": 33019531.0, "reward": 0.2454037368297577, "reward_std": 0.39766862988471985, "rewards/reward_func/mean": 0.2454037368297577, "rewards/reward_func/std": 0.3976685702800751, "sampling/importance_sampling_ratio/max": 2.8269150257110596, "sampling/importance_sampling_ratio/mean": 0.7975430488586426, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.909695863723755, "sampling/sampling_logp_difference/mean": 0.007487665396183729, "step": 93, "step_time": 474.8481697048992 }, { "clip_ratio/high_max": 0.04168192390443437, "clip_ratio/high_mean": 0.005210240488054296, "clip_ratio/low_mean": 0.0055653811286902055, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01077562254795339, "entropy": 0.15040505304932594, "epoch": 0.000752, "grad_norm": 0.008554578758776188, "kl": 0.21248403005301952, "learning_rate": 0.0001, "loss": 0.0569, "step": 94, "step_time": 175.67941991216503 }, { "clip_ratio/high_max": 0.004689611494541168, "clip_ratio/high_mean": 0.0007652845233678818, "clip_ratio/low_mean": 0.00020595593377947807, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0009712404571473598, "completions/clipped_ratio": 0.0703125, "completions/max_length": 16384.0, "completions/max_terminated_length": 15830.0, "completions/mean_length": 1703.2734375, "completions/mean_terminated_length": 592.9664306640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.11936141178011894, "epoch": 0.00076, "frac_reward_zero_std": 0.5625, "grad_norm": 0.007037501782178879, "kl": 0.15314552932977676, "learning_rate": 0.0001, "loss": 0.0121, "num_tokens": 33616110.0, "reward": 0.5862494707107544, "reward_std": 0.4870387017726898, "rewards/reward_func/mean": 0.5862494707107544, "rewards/reward_func/std": 0.4870387017726898, "sampling/importance_sampling_ratio/max": 1.921995997428894, "sampling/importance_sampling_ratio/mean": 0.8520303964614868, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.492267608642578, "sampling/sampling_logp_difference/mean": 0.0060460735112428665, "step": 95, "step_time": 279.93328415811993 }, { "clip_ratio/high_max": 0.1648085294291377, "clip_ratio/high_mean": 0.027523898315848783, "clip_ratio/low_mean": 0.00035762626794166863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02788152452558279, "entropy": 0.11354503780603409, "epoch": 0.000768, "grad_norm": 0.007192930206656456, "kl": 0.07495404127985239, "learning_rate": 0.0001, "loss": -0.0076, "step": 96, "step_time": 81.44801545701921 }, { "clip_ratio/high_max": 0.0718371415277943, "clip_ratio/high_mean": 0.009176566891255789, "clip_ratio/low_mean": 0.0004482567746890709, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009624823651392944, "completions/clipped_ratio": 0.1640625, "completions/max_length": 16384.0, "completions/max_terminated_length": 12879.0, "completions/mean_length": 3361.4921875, "completions/mean_terminated_length": 805.6728515625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.09433631971478462, "epoch": 0.000776, "frac_reward_zero_std": 0.625, "grad_norm": 0.006469857878983021, "kl": 0.06316580064594746, "learning_rate": 0.0001, "loss": -0.0171, "num_tokens": 34458349.0, "reward": 0.3976612091064453, "reward_std": 0.47885704040527344, "rewards/reward_func/mean": 0.3976612091064453, "rewards/reward_func/std": 0.47885704040527344, "sampling/importance_sampling_ratio/max": 1.6287328004837036, "sampling/importance_sampling_ratio/mean": 0.7641345262527466, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.092411518096924, "sampling/sampling_logp_difference/mean": 0.0073972526006400585, "step": 97, "step_time": 312.0833521957975 }, { "clip_ratio/high_max": 0.0019895988516509533, "clip_ratio/high_mean": 0.000492683844640851, "clip_ratio/low_mean": 0.004362157778814435, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004854841623455286, "entropy": 0.11002197489142418, "epoch": 0.000784, "grad_norm": 0.0009743753471411765, "kl": 0.06945361755788326, "learning_rate": 0.0001, "loss": 0.0, "step": 98, "step_time": 100.79940819228068 }, { "clip_ratio/high_max": 0.00242789089679718, "clip_ratio/high_mean": 0.00032243724854197353, "clip_ratio/low_mean": 3.6235200241208076e-05, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0003586724487831816, "completions/clipped_ratio": 0.0078125, "completions/max_length": 16384.0, "completions/max_terminated_length": 9493.0, "completions/mean_length": 656.71875, "completions/mean_terminated_length": 532.8818969726562, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.12588375620543957, "epoch": 0.000792, "frac_reward_zero_std": 0.75, "grad_norm": 0.024520058184862137, "kl": 0.1035240120254457, "learning_rate": 0.0001, "loss": 0.0842, "num_tokens": 34860025.0, "reward": 0.5634695887565613, "reward_std": 0.4744718670845032, "rewards/reward_func/mean": 0.5634695887565613, "rewards/reward_func/std": 0.4744718670845032, "sampling/importance_sampling_ratio/max": 1.7210280895233154, "sampling/importance_sampling_ratio/mean": 0.9508634805679321, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.2289901971817017, "sampling/sampling_logp_difference/mean": 0.006527569144964218, "step": 99, "step_time": 239.3775063320063 }, { "clip_ratio/high_max": 0.08591295027872548, "clip_ratio/high_mean": 0.011129089194582775, "clip_ratio/low_mean": 0.005252894119621487, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016381983092287555, "entropy": 0.11453884467482567, "epoch": 0.0008, "grad_norm": 0.01832975633442402, "kl": 0.032136627938598394, "learning_rate": 0.0001, "loss": -0.0691, "step": 100, "step_time": 78.3447534351144 } ], "logging_steps": 1, "max_steps": 10000, "num_input_tokens_seen": 34860025, "num_train_epochs": 1, "save_steps": 50, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }