| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.0008, | |
| "eval_steps": 500, | |
| "global_step": 100, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1790.0, | |
| "completions/max_terminated_length": 1790.0, | |
| "completions/mean_length": 182.1484375, | |
| "completions/mean_terminated_length": 182.1484375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.28799160569906235, | |
| "epoch": 8e-06, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.015653066337108612, | |
| "kl": 9.092812547351059e-07, | |
| "learning_rate": 0.0, | |
| "loss": 0.0006, | |
| "num_tokens": 572435.0, | |
| "reward": 0.5309156775474548, | |
| "reward_std": 0.4604809284210205, | |
| "rewards/reward_func/mean": 0.5309156775474548, | |
| "rewards/reward_func/std": 0.4604808986186981, | |
| "sampling/importance_sampling_ratio/max": 1.9294580221176147, | |
| "sampling/importance_sampling_ratio/mean": 1.009232997894287, | |
| "sampling/importance_sampling_ratio/min": 0.34108418226242065, | |
| "sampling/sampling_logp_difference/max": 0.4118213653564453, | |
| "sampling/sampling_logp_difference/mean": 0.006867324002087116, | |
| "step": 1, | |
| "step_time": 92.00019569275901 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "entropy": 0.2680463492870331, | |
| "epoch": 1.6e-05, | |
| "grad_norm": 0.030799396336078644, | |
| "kl": 3.06405117722619e-07, | |
| "learning_rate": 5e-06, | |
| "loss": -0.0008, | |
| "step": 2, | |
| "step_time": 32.768778500845656 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0007134150364436209, | |
| "clip_ratio/high_mean": 8.917687955545262e-05, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 8.917687955545262e-05, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5461.0, | |
| "completions/max_terminated_length": 5461.0, | |
| "completions/mean_length": 492.9140625, | |
| "completions/mean_terminated_length": 492.9140625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.2738131880760193, | |
| "epoch": 2.4e-05, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.04047725349664688, | |
| "kl": 0.00043132787686772645, | |
| "learning_rate": 1e-05, | |
| "loss": 0.0128, | |
| "num_tokens": 1023912.0, | |
| "reward": 0.36626869440078735, | |
| "reward_std": 0.465108722448349, | |
| "rewards/reward_func/mean": 0.36626869440078735, | |
| "rewards/reward_func/std": 0.465108722448349, | |
| "sampling/importance_sampling_ratio/max": 2.1430017948150635, | |
| "sampling/importance_sampling_ratio/mean": 0.9558022022247314, | |
| "sampling/importance_sampling_ratio/min": 0.020946042612195015, | |
| "sampling/sampling_logp_difference/max": 1.0668578147888184, | |
| "sampling/sampling_logp_difference/mean": 0.012038183398544788, | |
| "step": 3, | |
| "step_time": 149.4881290521007 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.000916204895474948, | |
| "clip_ratio/high_mean": 0.0001145256119343685, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0001145256119343685, | |
| "entropy": 0.3138197138905525, | |
| "epoch": 3.2e-05, | |
| "grad_norm": 0.022033225744962692, | |
| "kl": 0.0005088059915578924, | |
| "learning_rate": 1.5e-05, | |
| "loss": -0.025, | |
| "step": 4, | |
| "step_time": 58.02533454587683 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 8909.0, | |
| "completions/mean_length": 1400.8125, | |
| "completions/mean_terminated_length": 401.933349609375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.3101617470383644, | |
| "epoch": 4e-05, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.0025309158954769373, | |
| "kl": 0.0005323870427673683, | |
| "learning_rate": 2e-05, | |
| "loss": -0.0, | |
| "num_tokens": 1820264.0, | |
| "reward": 0.4536225199699402, | |
| "reward_std": 0.49777504801750183, | |
| "rewards/reward_func/mean": 0.4536225199699402, | |
| "rewards/reward_func/std": 0.49777501821517944, | |
| "sampling/importance_sampling_ratio/max": 1.2027839422225952, | |
| "sampling/importance_sampling_ratio/mean": 0.8852319717407227, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.777749061584473, | |
| "sampling/sampling_logp_difference/mean": 0.01063137874007225, | |
| "step": 5, | |
| "step_time": 239.55608439911157 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0052083334885537624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "entropy": 0.3320774510502815, | |
| "epoch": 4.8e-05, | |
| "grad_norm": 0.002884262939915061, | |
| "kl": 0.0017297266749665141, | |
| "learning_rate": 2.5e-05, | |
| "loss": 0.0, | |
| "step": 6, | |
| "step_time": 59.483061871957034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0013319647405296564, | |
| "clip_ratio/high_mean": 0.00016649559256620705, | |
| "clip_ratio/low_mean": 0.00011880823876708746, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0002853038313332945, | |
| "completions/clipped_ratio": 0.0859375, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 12890.0, | |
| "completions/mean_length": 2285.6953125, | |
| "completions/mean_terminated_length": 960.2137451171875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.28958937525749207, | |
| "epoch": 5.6e-05, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.0031948399264365435, | |
| "kl": 0.0007329104555537924, | |
| "learning_rate": 3e-05, | |
| "loss": -0.0075, | |
| "num_tokens": 2596929.0, | |
| "reward": 0.37591269612312317, | |
| "reward_std": 0.46268102526664734, | |
| "rewards/reward_func/mean": 0.37591269612312317, | |
| "rewards/reward_func/std": 0.46268102526664734, | |
| "sampling/importance_sampling_ratio/max": 1.3332258462905884, | |
| "sampling/importance_sampling_ratio/mean": 0.7790708541870117, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.7908563613891602, | |
| "sampling/sampling_logp_difference/mean": 0.011350465007126331, | |
| "step": 7, | |
| "step_time": 291.5669959378429 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.06269620433158707, | |
| "clip_ratio/high_mean": 0.007905740383648663, | |
| "clip_ratio/low_mean": 3.790311166085303e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007943643497128505, | |
| "entropy": 0.27083906158804893, | |
| "epoch": 6.4e-05, | |
| "grad_norm": 0.0021086863707751036, | |
| "kl": 0.0014875386259518564, | |
| "learning_rate": 3.5e-05, | |
| "loss": -0.0005, | |
| "step": 8, | |
| "step_time": 86.35025516920723 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006997402815613896, | |
| "clip_ratio/high_mean": 0.00015431304200319573, | |
| "clip_ratio/low_mean": 0.00010484526228538016, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0002591583006505971, | |
| "completions/clipped_ratio": 0.0703125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14202.0, | |
| "completions/mean_length": 2823.390625, | |
| "completions/mean_terminated_length": 1797.7984619140625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.14772238209843636, | |
| "epoch": 7.2e-05, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.006001872010529041, | |
| "kl": 0.000943778213695623, | |
| "learning_rate": 4e-05, | |
| "loss": 0.0108, | |
| "num_tokens": 3207611.0, | |
| "reward": 0.5036642551422119, | |
| "reward_std": 0.49157199263572693, | |
| "rewards/reward_func/mean": 0.5036642551422119, | |
| "rewards/reward_func/std": 0.49157199263572693, | |
| "sampling/importance_sampling_ratio/max": 1.5270230770111084, | |
| "sampling/importance_sampling_ratio/mean": 0.7227185964584351, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.1080689430236816, | |
| "sampling/sampling_logp_difference/mean": 0.004280484281480312, | |
| "step": 9, | |
| "step_time": 252.9167389899958 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.001605564437340945, | |
| "clip_ratio/high_mean": 0.00024234261945821345, | |
| "clip_ratio/low_mean": 9.370225598104298e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00033604487907723524, | |
| "entropy": 0.19918649271130562, | |
| "epoch": 8e-05, | |
| "grad_norm": 0.0017423235112801194, | |
| "kl": 0.0017632123199291527, | |
| "learning_rate": 4.5e-05, | |
| "loss": 0.0011, | |
| "step": 10, | |
| "step_time": 71.32449517771602 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0016184170162887312, | |
| "clip_ratio/high_mean": 0.0002023021270360914, | |
| "clip_ratio/low_mean": 2.689328721316997e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00022919541515875608, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 15886.0, | |
| "completions/mean_length": 1933.03125, | |
| "completions/mean_terminated_length": 708.3728637695312, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.283096544444561, | |
| "epoch": 8.8e-05, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.005225648172199726, | |
| "kl": 0.020552616333588958, | |
| "learning_rate": 5e-05, | |
| "loss": -0.0047, | |
| "num_tokens": 3911607.0, | |
| "reward": 0.3723485767841339, | |
| "reward_std": 0.45915600657463074, | |
| "rewards/reward_func/mean": 0.3723485767841339, | |
| "rewards/reward_func/std": 0.45915600657463074, | |
| "sampling/importance_sampling_ratio/max": 2.6595818996429443, | |
| "sampling/importance_sampling_ratio/mean": 0.8330748081207275, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 4.817470550537109, | |
| "sampling/sampling_logp_difference/mean": 0.018553579226136208, | |
| "step": 11, | |
| "step_time": 305.5446167134214 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.11326734800240956, | |
| "clip_ratio/high_mean": 0.014203241193172289, | |
| "clip_ratio/low_mean": 0.02032394427806139, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03452718561311485, | |
| "entropy": 0.31704550981521606, | |
| "epoch": 9.6e-05, | |
| "grad_norm": 0.0021242008078843355, | |
| "kl": 0.041884748614393175, | |
| "learning_rate": 5.500000000000001e-05, | |
| "loss": -0.0024, | |
| "step": 12, | |
| "step_time": 100.18367045000196 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05000000074505806, | |
| "clip_ratio/high_mean": 0.0062500000931322575, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0062500000931322575, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14215.0, | |
| "completions/mean_length": 994.609375, | |
| "completions/mean_terminated_length": 498.1773986816406, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.21508953720331192, | |
| "epoch": 0.000104, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.008786521852016449, | |
| "kl": 0.010858730238396674, | |
| "learning_rate": 6e-05, | |
| "loss": 0.0112, | |
| "num_tokens": 4508533.0, | |
| "reward": 0.49886971712112427, | |
| "reward_std": 0.479078471660614, | |
| "rewards/reward_func/mean": 0.49886971712112427, | |
| "rewards/reward_func/std": 0.479078471660614, | |
| "sampling/importance_sampling_ratio/max": 1.2790054082870483, | |
| "sampling/importance_sampling_ratio/mean": 0.9344986081123352, | |
| "sampling/importance_sampling_ratio/min": 1.8428319634167245e-11, | |
| "sampling/sampling_logp_difference/max": 1.4877722263336182, | |
| "sampling/sampling_logp_difference/mean": 0.008521802723407745, | |
| "step": 13, | |
| "step_time": 443.75575554184616 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.20842555643321248, | |
| "clip_ratio/high_mean": 0.031284590383620525, | |
| "clip_ratio/low_mean": 0.03960632954840548, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.07089091930538416, | |
| "entropy": 0.18631769344210625, | |
| "epoch": 0.000112, | |
| "grad_norm": 0.003839960554614663, | |
| "kl": 0.3403822723776102, | |
| "learning_rate": 6.500000000000001e-05, | |
| "loss": -0.0202, | |
| "step": 14, | |
| "step_time": 85.95637208526023 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0001768385773175396, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0001768385773175396, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 5468.0, | |
| "completions/max_terminated_length": 5468.0, | |
| "completions/mean_length": 578.9375, | |
| "completions/mean_terminated_length": 578.9375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.159299585968256, | |
| "epoch": 0.00012, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.012311691418290138, | |
| "kl": 0.01162221294362098, | |
| "learning_rate": 7e-05, | |
| "loss": 0.0128, | |
| "num_tokens": 5117069.0, | |
| "reward": 0.5088721513748169, | |
| "reward_std": 0.4710608422756195, | |
| "rewards/reward_func/mean": 0.5088721513748169, | |
| "rewards/reward_func/std": 0.4710608124732971, | |
| "sampling/importance_sampling_ratio/max": 2.0296452045440674, | |
| "sampling/importance_sampling_ratio/mean": 0.9467288255691528, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 6.81820821762085, | |
| "sampling/sampling_logp_difference/mean": 0.011304730549454689, | |
| "step": 15, | |
| "step_time": 139.54756120592356 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.1666666716337204, | |
| "clip_ratio/high_mean": 0.02633665595203638, | |
| "clip_ratio/low_mean": 0.027520230985828675, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.053856888320297, | |
| "entropy": 0.24024979025125504, | |
| "epoch": 0.000128, | |
| "grad_norm": 0.004696316551417112, | |
| "kl": 0.04927169228903949, | |
| "learning_rate": 7.500000000000001e-05, | |
| "loss": 0.0008, | |
| "step": 16, | |
| "step_time": 51.39879226591438 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.001534179231384769, | |
| "clip_ratio/high_mean": 0.00019177240392309614, | |
| "clip_ratio/low_mean": 5.4063617426436394e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00024583602498751134, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 12325.0, | |
| "completions/mean_length": 2109.71875, | |
| "completions/mean_terminated_length": 900.0338745117188, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.259117666631937, | |
| "epoch": 0.000136, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.0027392900083214045, | |
| "kl": 0.022709450451657176, | |
| "learning_rate": 8e-05, | |
| "loss": -0.0015, | |
| "num_tokens": 5846785.0, | |
| "reward": 0.5021514892578125, | |
| "reward_std": 0.4946684241294861, | |
| "rewards/reward_func/mean": 0.5021514892578125, | |
| "rewards/reward_func/std": 0.4946684241294861, | |
| "sampling/importance_sampling_ratio/max": 1.3155046701431274, | |
| "sampling/importance_sampling_ratio/mean": 0.8164673447608948, | |
| "sampling/importance_sampling_ratio/min": 3.531794431563262e-12, | |
| "sampling/sampling_logp_difference/max": 1.9771251678466797, | |
| "sampling/sampling_logp_difference/mean": 0.010305984877049923, | |
| "step": 17, | |
| "step_time": 405.8358749570325 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.1666666716337204, | |
| "clip_ratio/high_mean": 0.021097486838698387, | |
| "clip_ratio/low_mean": 0.011532710865139961, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03263019863516092, | |
| "entropy": 0.1985614150762558, | |
| "epoch": 0.000144, | |
| "grad_norm": 0.005495882593095303, | |
| "kl": 0.03126369323581457, | |
| "learning_rate": 8.5e-05, | |
| "loss": -0.0155, | |
| "step": 18, | |
| "step_time": 142.6410668361932 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0031972584838513285, | |
| "clip_ratio/high_mean": 0.0005761296160926577, | |
| "clip_ratio/low_mean": 0.0004054550954606384, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0009815847006393597, | |
| "completions/clipped_ratio": 0.1796875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 16000.0, | |
| "completions/mean_length": 4290.2109375, | |
| "completions/mean_terminated_length": 1641.0953369140625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.09939680807292461, | |
| "epoch": 0.000152, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.004325419198721647, | |
| "kl": 0.025787187332753092, | |
| "learning_rate": 9e-05, | |
| "loss": 0.0035, | |
| "num_tokens": 6617028.0, | |
| "reward": 0.4427623152732849, | |
| "reward_std": 0.4857458472251892, | |
| "rewards/reward_func/mean": 0.4427623152732849, | |
| "rewards/reward_func/std": 0.4857458472251892, | |
| "sampling/importance_sampling_ratio/max": 1.2794967889785767, | |
| "sampling/importance_sampling_ratio/mean": 0.6535032987594604, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 3.10308837890625, | |
| "sampling/sampling_logp_difference/mean": 0.007126981392502785, | |
| "step": 19, | |
| "step_time": 253.69875198067166 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.002380200894549489, | |
| "clip_ratio/high_mean": 0.0004641784689738415, | |
| "clip_ratio/low_mean": 0.006076709658373147, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006540888105519116, | |
| "entropy": 0.10411721095442772, | |
| "epoch": 0.00016, | |
| "grad_norm": 0.0034780765417963266, | |
| "kl": 0.08655104972422123, | |
| "learning_rate": 9.5e-05, | |
| "loss": -0.0049, | |
| "step": 20, | |
| "step_time": 54.08613259694539 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.0052374619990587234, | |
| "clip_ratio/low_mean": 7.05718994140625e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.005308033898472786, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14426.0, | |
| "completions/mean_length": 2772.7421875, | |
| "completions/mean_terminated_length": 545.4454345703125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.26159290969371796, | |
| "epoch": 0.000168, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 0.0016806161729618907, | |
| "kl": 0.13244479056447744, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0032, | |
| "num_tokens": 7608123.0, | |
| "reward": 0.30994826555252075, | |
| "reward_std": 0.4333060681819916, | |
| "rewards/reward_func/mean": 0.30994826555252075, | |
| "rewards/reward_func/std": 0.4333060681819916, | |
| "sampling/importance_sampling_ratio/max": 2.071073532104492, | |
| "sampling/importance_sampling_ratio/mean": 0.8328644633293152, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.6652194261550903, | |
| "sampling/sampling_logp_difference/mean": 0.013313735835254192, | |
| "step": 21, | |
| "step_time": 408.48256488214247 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0009995178115786985, | |
| "clip_ratio/high_mean": 0.0001249397264473373, | |
| "clip_ratio/low_mean": 0.01684358110651374, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01696852078748634, | |
| "entropy": 0.21135510131716728, | |
| "epoch": 0.000176, | |
| "grad_norm": 0.011167092248797417, | |
| "kl": 0.07416732516139746, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0403, | |
| "step": 22, | |
| "step_time": 148.2932638968341 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006121824262663722, | |
| "clip_ratio/high_mean": 7.652280328329653e-05, | |
| "clip_ratio/low_mean": 0.0001096606720238924, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00018618347530718893, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 16332.0, | |
| "completions/mean_length": 3878.53125, | |
| "completions/mean_terminated_length": 1283.056640625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.17607736214995384, | |
| "epoch": 0.000184, | |
| "frac_reward_zero_std": 0.6875, | |
| "grad_norm": 0.0007014994043856859, | |
| "kl": 0.027259970782324672, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0, | |
| "num_tokens": 8549975.0, | |
| "reward": 0.4119563102722168, | |
| "reward_std": 0.4680332541465759, | |
| "rewards/reward_func/mean": 0.4119563102722168, | |
| "rewards/reward_func/std": 0.4680332839488983, | |
| "sampling/importance_sampling_ratio/max": 1.453569769859314, | |
| "sampling/importance_sampling_ratio/mean": 0.7403019666671753, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.4103574752807617, | |
| "sampling/sampling_logp_difference/mean": 0.00972401350736618, | |
| "step": 23, | |
| "step_time": 281.3584946768824 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0006908245850354433, | |
| "clip_ratio/high_mean": 8.635307312943041e-05, | |
| "clip_ratio/low_mean": 0.00011251296746195294, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00019886604059138335, | |
| "entropy": 0.21833522990345955, | |
| "epoch": 0.000192, | |
| "grad_norm": 0.0062239160761237144, | |
| "kl": 0.06342287547886372, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0208, | |
| "step": 24, | |
| "step_time": 83.19132332946174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3324.0, | |
| "completions/max_terminated_length": 3324.0, | |
| "completions/mean_length": 350.3125, | |
| "completions/mean_terminated_length": 350.3125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.21079584956169128, | |
| "epoch": 0.0002, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.01816585287451744, | |
| "kl": 0.047579593025147915, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0, | |
| "num_tokens": 9024447.0, | |
| "reward": 0.5932583212852478, | |
| "reward_std": 0.4719974398612976, | |
| "rewards/reward_func/mean": 0.5932583212852478, | |
| "rewards/reward_func/std": 0.4719974100589752, | |
| "sampling/importance_sampling_ratio/max": 2.610856533050537, | |
| "sampling/importance_sampling_ratio/mean": 0.9387929439544678, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8608701229095459, | |
| "sampling/sampling_logp_difference/mean": 0.012685808353126049, | |
| "step": 25, | |
| "step_time": 86.05315418587998 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.2500000037252903, | |
| "clip_ratio/high_mean": 0.06541509041562676, | |
| "clip_ratio/low_mean": 0.0699066836386919, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.13532177731394768, | |
| "entropy": 0.16260286793112755, | |
| "epoch": 0.000208, | |
| "grad_norm": 0.009579629637300968, | |
| "kl": 1.1700078528374434, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0004, | |
| "step": 26, | |
| "step_time": 34.18811777303927 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004500097595155239, | |
| "clip_ratio/high_mean": 0.0007490973512176424, | |
| "clip_ratio/low_mean": 2.6662485652195755e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0007757598377793329, | |
| "completions/clipped_ratio": 0.078125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 11635.0, | |
| "completions/mean_length": 2257.7578125, | |
| "completions/mean_terminated_length": 1060.61865234375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.1516659539192915, | |
| "epoch": 0.000216, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.0051688519306480885, | |
| "kl": 0.047932930290699005, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0004, | |
| "num_tokens": 9745576.0, | |
| "reward": 0.46375519037246704, | |
| "reward_std": 0.48423123359680176, | |
| "rewards/reward_func/mean": 0.46375519037246704, | |
| "rewards/reward_func/std": 0.48423123359680176, | |
| "sampling/importance_sampling_ratio/max": 2.536548376083374, | |
| "sampling/importance_sampling_ratio/mean": 0.8151211738586426, | |
| "sampling/importance_sampling_ratio/min": 5.257729753793683e-06, | |
| "sampling/sampling_logp_difference/max": 6.129981994628906, | |
| "sampling/sampling_logp_difference/mean": 0.005175800062716007, | |
| "step": 27, | |
| "step_time": 436.48245814908296 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08695227152202278, | |
| "clip_ratio/high_mean": 0.011061059150961228, | |
| "clip_ratio/low_mean": 0.010714802792790579, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021775862463982776, | |
| "entropy": 0.13691149465739727, | |
| "epoch": 0.000224, | |
| "grad_norm": 0.024410562589764595, | |
| "kl": 0.03237813455052674, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0973, | |
| "step": 28, | |
| "step_time": 156.84757056180388 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0003620828501880169, | |
| "clip_ratio/high_mean": 6.965760076127481e-05, | |
| "clip_ratio/low_mean": 0.0001634064483369002, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00023306404909817502, | |
| "completions/clipped_ratio": 0.0703125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 15615.0, | |
| "completions/mean_length": 2270.578125, | |
| "completions/mean_terminated_length": 1203.176513671875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.25759194791316986, | |
| "epoch": 0.000232, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.012714912183582783, | |
| "kl": 0.10287779942154884, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0209, | |
| "num_tokens": 10600314.0, | |
| "reward": 0.4594896733760834, | |
| "reward_std": 0.48041772842407227, | |
| "rewards/reward_func/mean": 0.4594896733760834, | |
| "rewards/reward_func/std": 0.4804176688194275, | |
| "sampling/importance_sampling_ratio/max": 2.5663199424743652, | |
| "sampling/importance_sampling_ratio/mean": 0.8002752065658569, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.7162349224090576, | |
| "sampling/sampling_logp_difference/mean": 0.015552837401628494, | |
| "step": 29, | |
| "step_time": 263.5061617055908 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04263468802673742, | |
| "clip_ratio/high_mean": 0.005374936816224363, | |
| "clip_ratio/low_mean": 0.01608989532542182, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021464832054334693, | |
| "entropy": 0.2745072916150093, | |
| "epoch": 0.00024, | |
| "grad_norm": 0.0038172281347215176, | |
| "kl": 0.09374767541885376, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0059, | |
| "step": 30, | |
| "step_time": 65.82626755977981 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0027790770400315523, | |
| "clip_ratio/high_mean": 0.00034738463000394404, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00034738463000394404, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2434.0, | |
| "completions/max_terminated_length": 2434.0, | |
| "completions/mean_length": 264.4453125, | |
| "completions/mean_terminated_length": 264.4453125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.23240000382065773, | |
| "epoch": 0.000248, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.018018538132309914, | |
| "kl": 0.07072961144149303, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0044, | |
| "num_tokens": 11270435.0, | |
| "reward": 0.5666041970252991, | |
| "reward_std": 0.49271145462989807, | |
| "rewards/reward_func/mean": 0.5666041970252991, | |
| "rewards/reward_func/std": 0.4927114248275757, | |
| "sampling/importance_sampling_ratio/max": 1.8200021982192993, | |
| "sampling/importance_sampling_ratio/mean": 0.97586989402771, | |
| "sampling/importance_sampling_ratio/min": 0.1302126795053482, | |
| "sampling/sampling_logp_difference/max": 0.7188519239425659, | |
| "sampling/sampling_logp_difference/mean": 0.010895353741943836, | |
| "step": 31, | |
| "step_time": 103.47118871309794 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.22714198799803853, | |
| "clip_ratio/high_mean": 0.06246224191272631, | |
| "clip_ratio/low_mean": 0.05428445339202881, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.11674669571220875, | |
| "entropy": 0.202346783131361, | |
| "epoch": 0.000256, | |
| "grad_norm": 0.03462144732475281, | |
| "kl": 0.26291508600115776, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0331, | |
| "step": 32, | |
| "step_time": 44.26117577217519 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 1344.0, | |
| "completions/mean_length": 3158.328125, | |
| "completions/mean_terminated_length": 106.25000762939453, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.3173985183238983, | |
| "epoch": 0.000264, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.0015116184949874878, | |
| "kl": 0.17226483300328255, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "num_tokens": 12009445.0, | |
| "reward": 0.37367647886276245, | |
| "reward_std": 0.4661514163017273, | |
| "rewards/reward_func/mean": 0.37367647886276245, | |
| "rewards/reward_func/std": 0.4661514163017273, | |
| "sampling/importance_sampling_ratio/max": 1.266974687576294, | |
| "sampling/importance_sampling_ratio/mean": 0.8097731471061707, | |
| "sampling/importance_sampling_ratio/min": 5.553430160176731e-09, | |
| "sampling/sampling_logp_difference/max": 13.181495666503906, | |
| "sampling/sampling_logp_difference/mean": 0.015717996284365654, | |
| "step": 33, | |
| "step_time": 252.64742845576257 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.2083333395421505, | |
| "clip_ratio/high_mean": 0.026041667442768812, | |
| "clip_ratio/low_mean": 0.015625000465661287, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04166666744276881, | |
| "entropy": 0.3447694480419159, | |
| "epoch": 0.000272, | |
| "grad_norm": 0.0012050194200128317, | |
| "kl": 0.09908118983730674, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0, | |
| "step": 34, | |
| "step_time": 52.40133061888628 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 10039.0, | |
| "completions/max_terminated_length": 10039.0, | |
| "completions/mean_length": 676.140625, | |
| "completions/mean_terminated_length": 676.140625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.36223024874925613, | |
| "epoch": 0.00028, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.006729471497237682, | |
| "kl": 0.2994176782667637, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0002, | |
| "num_tokens": 12784223.0, | |
| "reward": 0.35155534744262695, | |
| "reward_std": 0.46008607745170593, | |
| "rewards/reward_func/mean": 0.35155534744262695, | |
| "rewards/reward_func/std": 0.46008607745170593, | |
| "sampling/importance_sampling_ratio/max": 1.6923160552978516, | |
| "sampling/importance_sampling_ratio/mean": 0.9230321645736694, | |
| "sampling/importance_sampling_ratio/min": 4.0193415544627353e-13, | |
| "sampling/sampling_logp_difference/max": 2.2721805572509766, | |
| "sampling/sampling_logp_difference/mean": 0.019083332270383835, | |
| "step": 35, | |
| "step_time": 156.88474073121324 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.21071429178118706, | |
| "clip_ratio/high_mean": 0.04456845438107848, | |
| "clip_ratio/low_mean": 0.042559525929391384, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.08712797984480858, | |
| "entropy": 0.3039735332131386, | |
| "epoch": 0.000288, | |
| "grad_norm": 0.002629757858812809, | |
| "kl": 1.421083964407444, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0001, | |
| "step": 36, | |
| "step_time": 47.79177230759524 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00212460938928416, | |
| "clip_ratio/high_mean": 0.0004709042623289861, | |
| "clip_ratio/low_mean": 0.00013680529809789732, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006077095604268834, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14803.0, | |
| "completions/mean_length": 1618.3046875, | |
| "completions/mean_terminated_length": 1383.9287109375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.205389566719532, | |
| "epoch": 0.000296, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.034038130193948746, | |
| "kl": 0.1238506119698286, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0496, | |
| "num_tokens": 13562182.0, | |
| "reward": 0.5307304859161377, | |
| "reward_std": 0.4933399260044098, | |
| "rewards/reward_func/mean": 0.5307304859161377, | |
| "rewards/reward_func/std": 0.4933399558067322, | |
| "sampling/importance_sampling_ratio/max": 2.9194023609161377, | |
| "sampling/importance_sampling_ratio/mean": 0.8693970441818237, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 6.139076232910156, | |
| "sampling/sampling_logp_difference/mean": 0.00930335745215416, | |
| "step": 37, | |
| "step_time": 304.38122520502657 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.12984464410692453, | |
| "clip_ratio/high_mean": 0.017246030358364806, | |
| "clip_ratio/low_mean": 0.04759028274565935, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06483631394803524, | |
| "entropy": 0.19659215956926346, | |
| "epoch": 0.000304, | |
| "grad_norm": 0.014286670833826065, | |
| "kl": 0.09789336752146482, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0889, | |
| "step": 38, | |
| "step_time": 108.92287370702252 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.002319882420124486, | |
| "clip_ratio/high_mean": 0.0004717662050097715, | |
| "clip_ratio/low_mean": 4.9786372983362526e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0005215525743551552, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14867.0, | |
| "completions/mean_length": 2032.5, | |
| "completions/mean_terminated_length": 1569.54833984375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.19366027787327766, | |
| "epoch": 0.000312, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.02529173344373703, | |
| "kl": 0.12496417853981256, | |
| "learning_rate": 0.0001, | |
| "loss": -0.1314, | |
| "num_tokens": 14363022.0, | |
| "reward": 0.6012634038925171, | |
| "reward_std": 0.46494749188423157, | |
| "rewards/reward_func/mean": 0.6012634038925171, | |
| "rewards/reward_func/std": 0.46494749188423157, | |
| "sampling/importance_sampling_ratio/max": 2.67891263961792, | |
| "sampling/importance_sampling_ratio/mean": 0.8825238943099976, | |
| "sampling/importance_sampling_ratio/min": 9.072877865667905e-12, | |
| "sampling/sampling_logp_difference/max": 2.937361478805542, | |
| "sampling/sampling_logp_difference/mean": 0.007425494492053986, | |
| "step": 39, | |
| "step_time": 242.51036442094482 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004297120030969381, | |
| "clip_ratio/high_mean": 0.0005371400038711727, | |
| "clip_ratio/low_mean": 0.019620355626102537, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.020157495309831575, | |
| "entropy": 0.2599617578089237, | |
| "epoch": 0.00032, | |
| "grad_norm": 0.003947969060391188, | |
| "kl": 0.17537523806095123, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0074, | |
| "step": 40, | |
| "step_time": 64.70451782317832 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05252361833117902, | |
| "clip_ratio/high_mean": 0.00663674037787132, | |
| "clip_ratio/low_mean": 0.0005198562575969845, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007156596751883626, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 12109.0, | |
| "completions/mean_length": 2499.59375, | |
| "completions/mean_terminated_length": 516.107177734375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.21654297411441803, | |
| "epoch": 0.000328, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.0020191774237900972, | |
| "kl": 0.050669580698013306, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0037, | |
| "num_tokens": 15148602.0, | |
| "reward": 0.41135668754577637, | |
| "reward_std": 0.4713096022605896, | |
| "rewards/reward_func/mean": 0.41135668754577637, | |
| "rewards/reward_func/std": 0.4713096022605896, | |
| "sampling/importance_sampling_ratio/max": 1.3856815099716187, | |
| "sampling/importance_sampling_ratio/mean": 0.8404459953308105, | |
| "sampling/importance_sampling_ratio/min": 8.073855711603073e-14, | |
| "sampling/sampling_logp_difference/max": 3.5181496143341064, | |
| "sampling/sampling_logp_difference/mean": 0.010210744105279446, | |
| "step": 41, | |
| "step_time": 249.73666059714742 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.051496061148100125, | |
| "clip_ratio/high_mean": 0.00651522628334078, | |
| "clip_ratio/low_mean": 0.010954441386274993, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.017469667189288884, | |
| "entropy": 0.22261176258325577, | |
| "epoch": 0.000336, | |
| "grad_norm": 0.01327864546328783, | |
| "kl": 0.08568831626325846, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0071, | |
| "step": 42, | |
| "step_time": 65.51700961985625 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2104.0, | |
| "completions/max_terminated_length": 2104.0, | |
| "completions/mean_length": 337.921875, | |
| "completions/mean_terminated_length": 337.921875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.18646935559809208, | |
| "epoch": 0.000344, | |
| "frac_reward_zero_std": 0.8125, | |
| "grad_norm": 0.009093403816223145, | |
| "kl": 0.04625028697773814, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0001, | |
| "num_tokens": 15697984.0, | |
| "reward": 0.5859714150428772, | |
| "reward_std": 0.48348718881607056, | |
| "rewards/reward_func/mean": 0.5859714150428772, | |
| "rewards/reward_func/std": 0.48348718881607056, | |
| "sampling/importance_sampling_ratio/max": 2.9582595825195312, | |
| "sampling/importance_sampling_ratio/mean": 0.9583175182342529, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.8077512979507446, | |
| "sampling/sampling_logp_difference/mean": 0.01057741791009903, | |
| "step": 43, | |
| "step_time": 83.54059210349806 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.02708333358168602, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02708333358168602, | |
| "entropy": 0.1239668894559145, | |
| "epoch": 0.000352, | |
| "grad_norm": 0.002130000153556466, | |
| "kl": 0.7455503353849053, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "step": 44, | |
| "step_time": 39.00458105024882 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 7439.0, | |
| "completions/max_terminated_length": 7439.0, | |
| "completions/mean_length": 794.125, | |
| "completions/mean_terminated_length": 794.125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.17747851833701134, | |
| "epoch": 0.00036, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.009445400908589363, | |
| "kl": 0.19956867769360542, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "num_tokens": 16409168.0, | |
| "reward": 0.3401130437850952, | |
| "reward_std": 0.44747307896614075, | |
| "rewards/reward_func/mean": 0.3401130437850952, | |
| "rewards/reward_func/std": 0.44747307896614075, | |
| "sampling/importance_sampling_ratio/max": 1.213844895362854, | |
| "sampling/importance_sampling_ratio/mean": 0.8736224174499512, | |
| "sampling/importance_sampling_ratio/min": 1.1355460628692526e-05, | |
| "sampling/sampling_logp_difference/max": 1.4877896308898926, | |
| "sampling/sampling_logp_difference/mean": 0.009566227905452251, | |
| "step": 45, | |
| "step_time": 112.80105081154034 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.005241899751126766, | |
| "clip_ratio/low_mean": 0.05211690114811063, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05735880043357611, | |
| "entropy": 0.11628733202815056, | |
| "epoch": 0.000368, | |
| "grad_norm": 0.0010464430088177323, | |
| "kl": 0.5155483353883028, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0001, | |
| "step": 46, | |
| "step_time": 33.51861782022752 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0008075302612269297, | |
| "clip_ratio/high_mean": 0.00010094128265336622, | |
| "clip_ratio/low_mean": 0.00017487616059952416, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0002758174450718798, | |
| "completions/clipped_ratio": 0.1171875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14041.0, | |
| "completions/mean_length": 2758.3125, | |
| "completions/mean_terminated_length": 949.5928955078125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.20697598904371262, | |
| "epoch": 0.000376, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.008048626594245434, | |
| "kl": 0.31821640580892563, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0341, | |
| "num_tokens": 17250760.0, | |
| "reward": 0.4497794508934021, | |
| "reward_std": 0.47367680072784424, | |
| "rewards/reward_func/mean": 0.4497794508934021, | |
| "rewards/reward_func/std": 0.4736768305301666, | |
| "sampling/importance_sampling_ratio/max": 1.4071918725967407, | |
| "sampling/importance_sampling_ratio/mean": 0.7693284749984741, | |
| "sampling/importance_sampling_ratio/min": 6.366646576258517e-14, | |
| "sampling/sampling_logp_difference/max": 2.0170488357543945, | |
| "sampling/sampling_logp_difference/mean": 0.014924651943147182, | |
| "step": 47, | |
| "step_time": 265.7373479530215 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.18790940550388768, | |
| "clip_ratio/high_mean": 0.04044250077276956, | |
| "clip_ratio/low_mean": 0.023039879743009806, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.06348238191276323, | |
| "entropy": 0.20131932944059372, | |
| "epoch": 0.000384, | |
| "grad_norm": 0.002510676858946681, | |
| "kl": 0.3020637482404709, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0018, | |
| "step": 48, | |
| "step_time": 63.53406945313327 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 2.0148290786892176e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 2.0148290786892176e-05, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 1549.0, | |
| "completions/max_terminated_length": 1549.0, | |
| "completions/mean_length": 194.90625, | |
| "completions/mean_terminated_length": 194.90625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.16713403165340424, | |
| "epoch": 0.000392, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.03661969304084778, | |
| "kl": 0.27568595856428146, | |
| "learning_rate": 0.0001, | |
| "loss": 0.1756, | |
| "num_tokens": 17609516.0, | |
| "reward": 0.3761705458164215, | |
| "reward_std": 0.4686991274356842, | |
| "rewards/reward_func/mean": 0.3761705458164215, | |
| "rewards/reward_func/std": 0.4686991274356842, | |
| "sampling/importance_sampling_ratio/max": 2.2934863567352295, | |
| "sampling/importance_sampling_ratio/mean": 1.0030004978179932, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.8843307495117188, | |
| "sampling/sampling_logp_difference/mean": 0.011824723333120346, | |
| "step": 49, | |
| "step_time": 33.989881575806066 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.18333333730697632, | |
| "clip_ratio/high_mean": 0.03967476915568113, | |
| "clip_ratio/low_mean": 0.06265822611749172, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.10233299620449543, | |
| "entropy": 0.11747358739376068, | |
| "epoch": 0.0004, | |
| "grad_norm": 520.3370971679688, | |
| "kl": 138366.12171524763, | |
| "learning_rate": 0.0001, | |
| "loss": 0.9764, | |
| "step": 50, | |
| "step_time": 11.585765323834494 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.001699419430224225, | |
| "clip_ratio/high_mean": 0.00021242742877802812, | |
| "clip_ratio/low_mean": 8.518956747138873e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00029761699261143804, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14391.0, | |
| "completions/mean_length": 1964.4296875, | |
| "completions/mean_terminated_length": 1499.2822265625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.19312381371855736, | |
| "epoch": 0.000408, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.004857824184000492, | |
| "kl": 0.11069730296730995, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0, | |
| "num_tokens": 18231323.0, | |
| "reward": 0.5406150221824646, | |
| "reward_std": 0.474507600069046, | |
| "rewards/reward_func/mean": 0.5406150221824646, | |
| "rewards/reward_func/std": 0.474507600069046, | |
| "sampling/importance_sampling_ratio/max": 1.754118800163269, | |
| "sampling/importance_sampling_ratio/mean": 0.8371249437332153, | |
| "sampling/importance_sampling_ratio/min": 2.1961432238731815e-12, | |
| "sampling/sampling_logp_difference/max": 1.711458444595337, | |
| "sampling/sampling_logp_difference/mean": 0.011138837784528732, | |
| "step": 51, | |
| "step_time": 234.7052824080456 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.052048374724108726, | |
| "clip_ratio/high_mean": 0.006506046840513591, | |
| "clip_ratio/low_mean": 0.0005927347665419802, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007098781847162172, | |
| "entropy": 0.18790747597813606, | |
| "epoch": 0.000416, | |
| "grad_norm": 0.02879762463271618, | |
| "kl": 0.14645008742809296, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0793, | |
| "step": 52, | |
| "step_time": 67.58970385813154 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0011903044069185853, | |
| "clip_ratio/high_mean": 0.00014878805086482316, | |
| "clip_ratio/low_mean": 0.00016782619059085846, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0003166142414556816, | |
| "completions/clipped_ratio": 0.015625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14571.0, | |
| "completions/mean_length": 1242.5234375, | |
| "completions/mean_terminated_length": 1002.1826171875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.2562270388007164, | |
| "epoch": 0.000424, | |
| "frac_reward_zero_std": 0.4375, | |
| "grad_norm": 0.003293583169579506, | |
| "kl": 0.14673679322004318, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0, | |
| "num_tokens": 18904638.0, | |
| "reward": 0.515934944152832, | |
| "reward_std": 0.4869852364063263, | |
| "rewards/reward_func/mean": 0.515934944152832, | |
| "rewards/reward_func/std": 0.4869852364063263, | |
| "sampling/importance_sampling_ratio/max": 2.125849485397339, | |
| "sampling/importance_sampling_ratio/mean": 0.8796348571777344, | |
| "sampling/importance_sampling_ratio/min": 1.6146490811053127e-09, | |
| "sampling/sampling_logp_difference/max": 2.236321210861206, | |
| "sampling/sampling_logp_difference/mean": 0.00907333567738533, | |
| "step": 53, | |
| "step_time": 403.6396517488174 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004773067426867783, | |
| "clip_ratio/high_mean": 0.0005966334283584729, | |
| "clip_ratio/low_mean": 9.633062768443779e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006929640658199787, | |
| "entropy": 0.16939815878868103, | |
| "epoch": 0.000432, | |
| "grad_norm": 0.005178892519325018, | |
| "kl": 0.12158099561929703, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0092, | |
| "step": 54, | |
| "step_time": 155.7508629639633 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00413007679162547, | |
| "clip_ratio/high_mean": 0.0005162595989531837, | |
| "clip_ratio/low_mean": 0.00044288246863288805, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.000959142082137987, | |
| "completions/clipped_ratio": 0.1640625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 15599.0, | |
| "completions/mean_length": 4115.78125, | |
| "completions/mean_terminated_length": 1708.0, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.14124679006636143, | |
| "epoch": 0.00044, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.0008667530491948128, | |
| "kl": 0.15955708548426628, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0005, | |
| "num_tokens": 19689170.0, | |
| "reward": 0.4943884015083313, | |
| "reward_std": 0.4778369963169098, | |
| "rewards/reward_func/mean": 0.4943884015083313, | |
| "rewards/reward_func/std": 0.4778369963169098, | |
| "sampling/importance_sampling_ratio/max": 1.5920473337173462, | |
| "sampling/importance_sampling_ratio/mean": 0.6500886082649231, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.026095390319824, | |
| "sampling/sampling_logp_difference/mean": 0.006507984362542629, | |
| "step": 55, | |
| "step_time": 295.36478371801786 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.006438895943574607, | |
| "clip_ratio/high_mean": 0.0010595864005153999, | |
| "clip_ratio/low_mean": 0.0001230314956046641, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0011826178670162335, | |
| "entropy": 0.16170401498675346, | |
| "epoch": 0.000448, | |
| "grad_norm": 0.0010976437479257584, | |
| "kl": 0.14007344283163548, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0014, | |
| "step": 56, | |
| "step_time": 87.99333154270425 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0005311340792104602, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0005311340792104602, | |
| "completions/clipped_ratio": 0.1171875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 10352.0, | |
| "completions/mean_length": 2540.15625, | |
| "completions/mean_terminated_length": 702.4778442382812, | |
| "completions/min_length": 3.0, | |
| "completions/min_terminated_length": 3.0, | |
| "entropy": 0.09304443560540676, | |
| "epoch": 0.000456, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.0008479373645968735, | |
| "kl": 0.06781758088618517, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "num_tokens": 20376822.0, | |
| "reward": 0.5963060855865479, | |
| "reward_std": 0.4900885820388794, | |
| "rewards/reward_func/mean": 0.5963060855865479, | |
| "rewards/reward_func/std": 0.4900885820388794, | |
| "sampling/importance_sampling_ratio/max": 1.911750316619873, | |
| "sampling/importance_sampling_ratio/mean": 0.787479043006897, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.4092633724212646, | |
| "sampling/sampling_logp_difference/mean": 0.004730356857180595, | |
| "step": 57, | |
| "step_time": 275.88833857746795 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0007761355664115399, | |
| "clip_ratio/high_mean": 9.701694580144249e-05, | |
| "clip_ratio/low_mean": 0.0005300462580635212, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006270632075029425, | |
| "entropy": 0.07158766873180866, | |
| "epoch": 0.000464, | |
| "grad_norm": 0.0023808805271983147, | |
| "kl": 0.058913652785122395, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0026, | |
| "step": 58, | |
| "step_time": 78.15128924208693 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0045549869537353516, | |
| "clip_ratio/high_mean": 0.0005933154025115073, | |
| "clip_ratio/low_mean": 0.0014756222371943295, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0020689376979134977, | |
| "completions/clipped_ratio": 0.1015625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14099.0, | |
| "completions/mean_length": 2205.8359375, | |
| "completions/mean_terminated_length": 603.0869140625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.12633545510470867, | |
| "epoch": 0.000472, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0027347116265445948, | |
| "kl": 0.06752120889723301, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0058, | |
| "num_tokens": 20882249.0, | |
| "reward": 0.37335968017578125, | |
| "reward_std": 0.46674516797065735, | |
| "rewards/reward_func/mean": 0.37335968017578125, | |
| "rewards/reward_func/std": 0.46674516797065735, | |
| "sampling/importance_sampling_ratio/max": 1.1944468021392822, | |
| "sampling/importance_sampling_ratio/mean": 0.8154112100601196, | |
| "sampling/importance_sampling_ratio/min": 3.1055763429627126e-12, | |
| "sampling/sampling_logp_difference/max": 2.516141891479492, | |
| "sampling/sampling_logp_difference/mean": 0.005523150786757469, | |
| "step": 59, | |
| "step_time": 243.8482750041876 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0070169707760214806, | |
| "clip_ratio/high_mean": 0.0008790283463895321, | |
| "clip_ratio/low_mean": 0.011722586234100163, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.012601614958839491, | |
| "entropy": 0.19912216998636723, | |
| "epoch": 0.00048, | |
| "grad_norm": 0.004964805673807859, | |
| "kl": 0.3173718089237809, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0111, | |
| "step": 60, | |
| "step_time": 67.48408747394569 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.003293595160357654, | |
| "clip_ratio/high_mean": 0.00041169939504470676, | |
| "clip_ratio/low_mean": 0.001325315679423511, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.001737015089020133, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 15692.0, | |
| "completions/mean_length": 3332.625, | |
| "completions/mean_terminated_length": 1196.9454345703125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.20970237255096436, | |
| "epoch": 0.000488, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.0021050143986940384, | |
| "kl": 0.22093774378299713, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0007, | |
| "num_tokens": 21614089.0, | |
| "reward": 0.39702850580215454, | |
| "reward_std": 0.48266974091529846, | |
| "rewards/reward_func/mean": 0.39702850580215454, | |
| "rewards/reward_func/std": 0.48266977071762085, | |
| "sampling/importance_sampling_ratio/max": 1.2085174322128296, | |
| "sampling/importance_sampling_ratio/mean": 0.7436270713806152, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.35117506980896, | |
| "sampling/sampling_logp_difference/mean": 0.009538266807794571, | |
| "step": 61, | |
| "step_time": 356.0667571427766 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0040483163320459425, | |
| "clip_ratio/high_mean": 0.0008040911634452641, | |
| "clip_ratio/low_mean": 0.000619270489551127, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0014233616821002215, | |
| "entropy": 0.2903680093586445, | |
| "epoch": 0.000496, | |
| "grad_norm": 0.0013163138646632433, | |
| "kl": 0.22006989642977715, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "step": 62, | |
| "step_time": 120.53421806404367 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04629576357547194, | |
| "clip_ratio/high_mean": 0.006143865539343096, | |
| "clip_ratio/low_mean": 0.00034697772935032845, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006490843268693425, | |
| "completions/clipped_ratio": 0.1875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 5936.0, | |
| "completions/mean_length": 3316.71875, | |
| "completions/mean_terminated_length": 301.19232177734375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.11050131916999817, | |
| "epoch": 0.000504, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.011926773004233837, | |
| "kl": 0.7313324622809887, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0115, | |
| "num_tokens": 22353821.0, | |
| "reward": 0.326221227645874, | |
| "reward_std": 0.46681538224220276, | |
| "rewards/reward_func/mean": 0.326221227645874, | |
| "rewards/reward_func/std": 0.46681535243988037, | |
| "sampling/importance_sampling_ratio/max": 1.7655757665634155, | |
| "sampling/importance_sampling_ratio/mean": 0.7426654100418091, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.8568098545074463, | |
| "sampling/sampling_logp_difference/mean": 0.006891004741191864, | |
| "step": 63, | |
| "step_time": 302.78563037491404 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04654776549432427, | |
| "clip_ratio/high_mean": 0.0064325097628170624, | |
| "clip_ratio/low_mean": 0.0002609335570014082, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.006693443312542513, | |
| "entropy": 0.2221522331237793, | |
| "epoch": 0.000512, | |
| "grad_norm": 0.00616535684093833, | |
| "kl": 0.31134266406297684, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0099, | |
| "step": 64, | |
| "step_time": 98.32436606986448 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.002486778888851404, | |
| "clip_ratio/high_mean": 0.0003108473611064255, | |
| "clip_ratio/low_mean": 0.004113847695407458, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004424695056513883, | |
| "completions/clipped_ratio": 0.0703125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 12115.0, | |
| "completions/mean_length": 1701.78125, | |
| "completions/mean_terminated_length": 591.3613891601562, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.21395106986165047, | |
| "epoch": 0.00052, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.0024383016861975193, | |
| "kl": 0.6399696841835976, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0004, | |
| "num_tokens": 23079545.0, | |
| "reward": 0.5035587549209595, | |
| "reward_std": 0.49234020709991455, | |
| "rewards/reward_func/mean": 0.5035587549209595, | |
| "rewards/reward_func/std": 0.49234020709991455, | |
| "sampling/importance_sampling_ratio/max": 2.8838155269622803, | |
| "sampling/importance_sampling_ratio/mean": 0.9122731685638428, | |
| "sampling/importance_sampling_ratio/min": 1.5084187154554285e-12, | |
| "sampling/sampling_logp_difference/max": 1.4747650623321533, | |
| "sampling/sampling_logp_difference/mean": 0.008541534654796124, | |
| "step": 65, | |
| "step_time": 420.5027240368072 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.006610034382902086, | |
| "clip_ratio/high_mean": 0.0008262542978627607, | |
| "clip_ratio/low_mean": 0.025520833674818277, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.026347088234615512, | |
| "entropy": 0.21546945348381996, | |
| "epoch": 0.000528, | |
| "grad_norm": 0.002588092116639018, | |
| "kl": 0.3157913535833359, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0018, | |
| "step": 66, | |
| "step_time": 150.25083671603352 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00305051077157259, | |
| "clip_ratio/high_mean": 0.0004194560169707984, | |
| "clip_ratio/low_mean": 0.0010861777554964647, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0015056337579153478, | |
| "completions/clipped_ratio": 0.03125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 11704.0, | |
| "completions/mean_length": 1241.578125, | |
| "completions/mean_terminated_length": 753.1128540039062, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.34802427887916565, | |
| "epoch": 0.000536, | |
| "frac_reward_zero_std": 0.1875, | |
| "grad_norm": 0.010660664178431034, | |
| "kl": 0.3574690632522106, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0188, | |
| "num_tokens": 23671939.0, | |
| "reward": 0.30863311886787415, | |
| "reward_std": 0.4487622380256653, | |
| "rewards/reward_func/mean": 0.30863311886787415, | |
| "rewards/reward_func/std": 0.4487622380256653, | |
| "sampling/importance_sampling_ratio/max": 1.2251367568969727, | |
| "sampling/importance_sampling_ratio/mean": 0.8774986267089844, | |
| "sampling/importance_sampling_ratio/min": 5.643987203594533e-15, | |
| "sampling/sampling_logp_difference/max": 2.2548747062683105, | |
| "sampling/sampling_logp_difference/mean": 0.01900642365217209, | |
| "step": 67, | |
| "step_time": 272.54457034613006 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04360994976013899, | |
| "clip_ratio/high_mean": 0.0056244394509121776, | |
| "clip_ratio/low_mean": 0.019384152255952358, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.025008591823279858, | |
| "entropy": 0.30202219262719154, | |
| "epoch": 0.000544, | |
| "grad_norm": 0.0031096329912543297, | |
| "kl": 0.19528233632445335, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0002, | |
| "step": 68, | |
| "step_time": 87.20590779092163 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 7.0, | |
| "completions/mean_length": 1026.96875, | |
| "completions/mean_terminated_length": 3.1666667461395264, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.360334113240242, | |
| "epoch": 0.000552, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.004637387115508318, | |
| "kl": 0.21597419865429401, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0001, | |
| "num_tokens": 24378991.0, | |
| "reward": 0.2120947688817978, | |
| "reward_std": 0.3439648747444153, | |
| "rewards/reward_func/mean": 0.2120947688817978, | |
| "rewards/reward_func/std": 0.3439648747444153, | |
| "sampling/importance_sampling_ratio/max": 1.2173486948013306, | |
| "sampling/importance_sampling_ratio/mean": 0.9408060908317566, | |
| "sampling/importance_sampling_ratio/min": 6.941108278424313e-12, | |
| "sampling/sampling_logp_difference/max": 2.992739677429199, | |
| "sampling/sampling_logp_difference/mean": 0.01907212659716606, | |
| "step": 69, | |
| "step_time": 413.641770795919 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.22500000521540642, | |
| "clip_ratio/high_mean": 0.033333334140479565, | |
| "clip_ratio/low_mean": 0.01145833358168602, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04479166818782687, | |
| "entropy": 0.43712426722049713, | |
| "epoch": 0.00056, | |
| "grad_norm": 0.0043171476572752, | |
| "kl": 0.2573888264596462, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0001, | |
| "step": 70, | |
| "step_time": 149.37348693376407 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.011275041149929166, | |
| "clip_ratio/high_mean": 0.0016421006293967366, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0016421006293967366, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 15718.0, | |
| "completions/mean_length": 1101.6171875, | |
| "completions/mean_terminated_length": 981.283447265625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.24557911232113838, | |
| "epoch": 0.000568, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.0067433081567287445, | |
| "kl": 0.1323122438043356, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0056, | |
| "num_tokens": 24892926.0, | |
| "reward": 0.5961877107620239, | |
| "reward_std": 0.48414939641952515, | |
| "rewards/reward_func/mean": 0.5961877107620239, | |
| "rewards/reward_func/std": 0.48414939641952515, | |
| "sampling/importance_sampling_ratio/max": 2.302067279815674, | |
| "sampling/importance_sampling_ratio/mean": 0.8856508731842041, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.4530627727508545, | |
| "sampling/sampling_logp_difference/mean": 0.014587011188268661, | |
| "step": 71, | |
| "step_time": 211.82522862195037 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05322292904020287, | |
| "clip_ratio/high_mean": 0.007361717482126551, | |
| "clip_ratio/low_mean": 0.0005052025571785634, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.007866920397646027, | |
| "entropy": 0.24957521632313728, | |
| "epoch": 0.000576, | |
| "grad_norm": 0.03510262817144394, | |
| "kl": 0.10960768908262253, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0024, | |
| "step": 72, | |
| "step_time": 55.394755602115765 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04315747210057452, | |
| "clip_ratio/high_mean": 0.005394684012571815, | |
| "clip_ratio/low_mean": 0.01105505934174289, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01644974334340077, | |
| "completions/clipped_ratio": 0.0390625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14039.0, | |
| "completions/mean_length": 1335.3359375, | |
| "completions/mean_terminated_length": 723.6016235351562, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.24440882354974747, | |
| "epoch": 0.000584, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.003040608251467347, | |
| "kl": 0.3741700351238251, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0027, | |
| "num_tokens": 25583089.0, | |
| "reward": 0.5570250153541565, | |
| "reward_std": 0.4779793322086334, | |
| "rewards/reward_func/mean": 0.5570250153541565, | |
| "rewards/reward_func/std": 0.4779793322086334, | |
| "sampling/importance_sampling_ratio/max": 1.250884771347046, | |
| "sampling/importance_sampling_ratio/mean": 0.888532280921936, | |
| "sampling/importance_sampling_ratio/min": 2.9074090690528465e-08, | |
| "sampling/sampling_logp_difference/max": 1.549929141998291, | |
| "sampling/sampling_logp_difference/mean": 0.010362871922552586, | |
| "step": 73, | |
| "step_time": 457.1430780822411 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.01055803267081501, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01055803267081501, | |
| "entropy": 0.20154350250959396, | |
| "epoch": 0.000592, | |
| "grad_norm": 0.002142983488738537, | |
| "kl": 0.14010655879974365, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0032, | |
| "step": 74, | |
| "step_time": 180.0332786256913 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.003929472557501867, | |
| "clip_ratio/high_mean": 0.000681739784340607, | |
| "clip_ratio/low_mean": 0.000918249599635601, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0015999893948901445, | |
| "completions/clipped_ratio": 0.171875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 13287.0, | |
| "completions/mean_length": 3697.3046875, | |
| "completions/mean_terminated_length": 1064.217041015625, | |
| "completions/min_length": 3.0, | |
| "completions/min_terminated_length": 3.0, | |
| "entropy": 0.22590620815753937, | |
| "epoch": 0.0006, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.003586079925298691, | |
| "kl": 0.14618558436632156, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0088, | |
| "num_tokens": 26577264.0, | |
| "reward": 0.3558464050292969, | |
| "reward_std": 0.4764367341995239, | |
| "rewards/reward_func/mean": 0.3558464050292969, | |
| "rewards/reward_func/std": 0.4764367640018463, | |
| "sampling/importance_sampling_ratio/max": 1.2136788368225098, | |
| "sampling/importance_sampling_ratio/mean": 0.7275122404098511, | |
| "sampling/importance_sampling_ratio/min": 1.4362665263063827e-19, | |
| "sampling/sampling_logp_difference/max": 1.6977732181549072, | |
| "sampling/sampling_logp_difference/mean": 0.013559934683144093, | |
| "step": 75, | |
| "step_time": 279.72021683468483 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.03983482558396645, | |
| "clip_ratio/high_mean": 0.005230471204413334, | |
| "clip_ratio/low_mean": 0.004741664102766663, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.009972134721465409, | |
| "entropy": 0.20037926360964775, | |
| "epoch": 0.000608, | |
| "grad_norm": 0.0038128597661852837, | |
| "kl": 0.12694548070430756, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0052, | |
| "step": 76, | |
| "step_time": 75.0094793732278 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0030184224306140095, | |
| "clip_ratio/high_mean": 0.0003773028038267512, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0003773028038267512, | |
| "completions/clipped_ratio": 0.140625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 13004.0, | |
| "completions/mean_length": 2918.640625, | |
| "completions/mean_terminated_length": 715.2181396484375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.2610139548778534, | |
| "epoch": 0.000616, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0017114380607381463, | |
| "kl": 0.35155298560857773, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0019, | |
| "num_tokens": 27487122.0, | |
| "reward": 0.37231287360191345, | |
| "reward_std": 0.451972633600235, | |
| "rewards/reward_func/mean": 0.37231287360191345, | |
| "rewards/reward_func/std": 0.451972633600235, | |
| "sampling/importance_sampling_ratio/max": 1.72040593624115, | |
| "sampling/importance_sampling_ratio/mean": 0.8099797964096069, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.51240873336792, | |
| "sampling/sampling_logp_difference/mean": 0.01623906008899212, | |
| "step": 77, | |
| "step_time": 270.05139491008595 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04338416282553226, | |
| "clip_ratio/high_mean": 0.00988730626704637, | |
| "clip_ratio/low_mean": 0.014765097017516382, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.024652403328218497, | |
| "entropy": 0.31988539546728134, | |
| "epoch": 0.000624, | |
| "grad_norm": 0.01336484681814909, | |
| "kl": 0.2897513546049595, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0337, | |
| "step": 78, | |
| "step_time": 79.79506105207838 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.05000000074505806, | |
| "clip_ratio/high_mean": 0.0062500000931322575, | |
| "clip_ratio/low_mean": 0.0052083334885537624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01145833358168602, | |
| "completions/clipped_ratio": 0.125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 1118.0, | |
| "completions/mean_length": 2130.7109375, | |
| "completions/mean_terminated_length": 94.52678680419922, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.2616175599396229, | |
| "epoch": 0.000632, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.003293546847999096, | |
| "kl": 0.34126188047230244, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0058, | |
| "num_tokens": 28247557.0, | |
| "reward": 0.42832934856414795, | |
| "reward_std": 0.45818737149238586, | |
| "rewards/reward_func/mean": 0.42832934856414795, | |
| "rewards/reward_func/std": 0.45818737149238586, | |
| "sampling/importance_sampling_ratio/max": 2.1360301971435547, | |
| "sampling/importance_sampling_ratio/mean": 0.9051436185836792, | |
| "sampling/importance_sampling_ratio/min": 2.526717501893927e-09, | |
| "sampling/sampling_logp_difference/max": 2.0171802043914795, | |
| "sampling/sampling_logp_difference/mean": 0.017018210142850876, | |
| "step": 79, | |
| "step_time": 285.03405929682776 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.22500000521540642, | |
| "clip_ratio/high_mean": 0.03081387374550104, | |
| "clip_ratio/low_mean": 0.01876397612886649, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.04957784991711378, | |
| "entropy": 0.2775324620306492, | |
| "epoch": 0.00064, | |
| "grad_norm": 0.003191626165062189, | |
| "kl": 0.21329555287957191, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0083, | |
| "step": 80, | |
| "step_time": 82.6638121791184 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0008650519303046167, | |
| "clip_ratio/high_mean": 0.00010813149128807709, | |
| "clip_ratio/low_mean": 0.0, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00010813149128807709, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 1379.0, | |
| "completions/mean_length": 1149.046875, | |
| "completions/mean_terminated_length": 133.3833465576172, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.17147246748209, | |
| "epoch": 0.000648, | |
| "frac_reward_zero_std": 0.5, | |
| "grad_norm": 0.0028895766008645296, | |
| "kl": 0.09206334501504898, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "num_tokens": 28968019.0, | |
| "reward": 0.5178102254867554, | |
| "reward_std": 0.47222229838371277, | |
| "rewards/reward_func/mean": 0.5178102254867554, | |
| "rewards/reward_func/std": 0.47222229838371277, | |
| "sampling/importance_sampling_ratio/max": 1.396835446357727, | |
| "sampling/importance_sampling_ratio/mean": 0.9524275064468384, | |
| "sampling/importance_sampling_ratio/min": 1.0122628737008199e-05, | |
| "sampling/sampling_logp_difference/max": 1.3480243682861328, | |
| "sampling/sampling_logp_difference/mean": 0.026343410834670067, | |
| "step": 81, | |
| "step_time": 286.13709013699554 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.09256594924954697, | |
| "clip_ratio/high_mean": 0.021243362592940684, | |
| "clip_ratio/low_mean": 0.01618036488071084, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.037423727568238974, | |
| "entropy": 0.19774584844708443, | |
| "epoch": 0.000656, | |
| "grad_norm": 0.003909197635948658, | |
| "kl": 0.20639685168862343, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0001, | |
| "step": 82, | |
| "step_time": 81.07379846903495 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0021464216988533735, | |
| "clip_ratio/high_mean": 0.0002683027123566717, | |
| "clip_ratio/low_mean": 0.005357950474717654, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0056262532161781564, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 3288.0, | |
| "completions/max_terminated_length": 3288.0, | |
| "completions/mean_length": 196.8671875, | |
| "completions/mean_terminated_length": 196.8671875, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.3457997739315033, | |
| "epoch": 0.000664, | |
| "frac_reward_zero_std": 0.3125, | |
| "grad_norm": 0.022016355767846107, | |
| "kl": 0.4088924489915371, | |
| "learning_rate": 0.0001, | |
| "loss": -0.007, | |
| "num_tokens": 29420586.0, | |
| "reward": 0.37082305550575256, | |
| "reward_std": 0.46140459179878235, | |
| "rewards/reward_func/mean": 0.37082305550575256, | |
| "rewards/reward_func/std": 0.46140459179878235, | |
| "sampling/importance_sampling_ratio/max": 1.4103598594665527, | |
| "sampling/importance_sampling_ratio/mean": 0.9493687748908997, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.0998687744140625, | |
| "sampling/sampling_logp_difference/mean": 0.023627880960702896, | |
| "step": 83, | |
| "step_time": 57.941960010211915 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.1273603499867022, | |
| "clip_ratio/high_mean": 0.016455542587209493, | |
| "clip_ratio/low_mean": 0.06458333507180214, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.08103887923061848, | |
| "entropy": 0.282495453953743, | |
| "epoch": 0.000672, | |
| "grad_norm": 0.021933559328317642, | |
| "kl": 0.2802862487733364, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0041, | |
| "step": 84, | |
| "step_time": 18.895374842220917 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0016916769818635657, | |
| "clip_ratio/high_mean": 0.00021145962273294572, | |
| "clip_ratio/low_mean": 0.0004629129107343033, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0006743725316482596, | |
| "completions/clipped_ratio": 0.0625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 8302.0, | |
| "completions/mean_length": 1417.296875, | |
| "completions/mean_terminated_length": 419.5166931152344, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.19242028519511223, | |
| "epoch": 0.00068, | |
| "frac_reward_zero_std": 0.25, | |
| "grad_norm": 0.004599843639880419, | |
| "kl": 0.10945684090256691, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0015, | |
| "num_tokens": 30223584.0, | |
| "reward": 0.38827669620513916, | |
| "reward_std": 0.4383874535560608, | |
| "rewards/reward_func/mean": 0.38827669620513916, | |
| "rewards/reward_func/std": 0.4383874237537384, | |
| "sampling/importance_sampling_ratio/max": 1.2852191925048828, | |
| "sampling/importance_sampling_ratio/mean": 0.8869220018386841, | |
| "sampling/importance_sampling_ratio/min": 6.8003160436092e-08, | |
| "sampling/sampling_logp_difference/max": 3.0428004264831543, | |
| "sampling/sampling_logp_difference/mean": 0.011553528718650341, | |
| "step": 85, | |
| "step_time": 401.25821340084076 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08453038916923106, | |
| "clip_ratio/high_mean": 0.01064000147744082, | |
| "clip_ratio/low_mean": 0.041717116328072734, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.052357119115185924, | |
| "entropy": 0.28386014327406883, | |
| "epoch": 0.000688, | |
| "grad_norm": 0.008475115522742271, | |
| "kl": 0.24625534750521183, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0125, | |
| "step": 86, | |
| "step_time": 140.54234700393863 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.025748229207238182, | |
| "clip_ratio/high_mean": 0.0034044762833218556, | |
| "clip_ratio/low_mean": 0.0008440050805802457, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00424848121474497, | |
| "completions/clipped_ratio": 0.0859375, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14449.0, | |
| "completions/mean_length": 2038.8046875, | |
| "completions/mean_terminated_length": 690.1111450195312, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.17366936802864075, | |
| "epoch": 0.000696, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.004945417400449514, | |
| "kl": 0.21483153477311134, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0155, | |
| "num_tokens": 30996463.0, | |
| "reward": 0.569595456123352, | |
| "reward_std": 0.4689222276210785, | |
| "rewards/reward_func/mean": 0.569595456123352, | |
| "rewards/reward_func/std": 0.4689222574234009, | |
| "sampling/importance_sampling_ratio/max": 1.5022563934326172, | |
| "sampling/importance_sampling_ratio/mean": 0.8090132474899292, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.836258888244629, | |
| "sampling/sampling_logp_difference/mean": 0.012665043585002422, | |
| "step": 87, | |
| "step_time": 420.3839778539259 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.07500000111758709, | |
| "clip_ratio/high_mean": 0.009656705195084214, | |
| "clip_ratio/low_mean": 0.021538077868171968, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.03119478444568813, | |
| "entropy": 0.14676123298704624, | |
| "epoch": 0.000704, | |
| "grad_norm": 0.0015365808503702283, | |
| "kl": 0.2069963738322258, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0006, | |
| "step": 88, | |
| "step_time": 159.39437931077555 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0, | |
| "clip_ratio/high_mean": 0.0, | |
| "clip_ratio/low_mean": 0.0052083334885537624, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0052083334885537624, | |
| "completions/clipped_ratio": 0.0, | |
| "completions/max_length": 2818.0, | |
| "completions/max_terminated_length": 2818.0, | |
| "completions/mean_length": 249.2734375, | |
| "completions/mean_terminated_length": 249.2734375, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.19034305587410927, | |
| "epoch": 0.000712, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.013274043798446655, | |
| "kl": 0.10685652680695057, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0003, | |
| "num_tokens": 31367970.0, | |
| "reward": 0.5855048894882202, | |
| "reward_std": 0.4757256805896759, | |
| "rewards/reward_func/mean": 0.5855048894882202, | |
| "rewards/reward_func/std": 0.4757256805896759, | |
| "sampling/importance_sampling_ratio/max": 2.164741277694702, | |
| "sampling/importance_sampling_ratio/mean": 0.9093552827835083, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 0.9638514518737793, | |
| "sampling/sampling_logp_difference/mean": 0.010461562313139439, | |
| "step": 89, | |
| "step_time": 45.6294292754028 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.1666666716337204, | |
| "clip_ratio/high_mean": 0.031250000931322575, | |
| "clip_ratio/low_mean": 0.02095832316626911, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.05220832256600261, | |
| "entropy": 0.16832438856363297, | |
| "epoch": 0.00072, | |
| "grad_norm": 0.013677907176315784, | |
| "kl": 0.21795203164219856, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0003, | |
| "step": 90, | |
| "step_time": 14.39668608084321 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0016864245990291238, | |
| "clip_ratio/high_mean": 0.00021080307487864047, | |
| "clip_ratio/low_mean": 9.959549061022699e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.00031039856548886746, | |
| "completions/clipped_ratio": 0.0703125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 14607.0, | |
| "completions/mean_length": 2265.78125, | |
| "completions/mean_terminated_length": 1198.016845703125, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.3109714537858963, | |
| "epoch": 0.000728, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.0026221030857414007, | |
| "kl": 0.29385947436094284, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0016, | |
| "num_tokens": 31978134.0, | |
| "reward": 0.41071587800979614, | |
| "reward_std": 0.48560255765914917, | |
| "rewards/reward_func/mean": 0.41071587800979614, | |
| "rewards/reward_func/std": 0.48560255765914917, | |
| "sampling/importance_sampling_ratio/max": 2.1744186878204346, | |
| "sampling/importance_sampling_ratio/mean": 0.7339906692504883, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.330114483833313, | |
| "sampling/sampling_logp_difference/mean": 0.01747949793934822, | |
| "step": 91, | |
| "step_time": 224.5670603781473 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0416666679084301, | |
| "clip_ratio/high_mean": 0.00536864111199975, | |
| "clip_ratio/low_mean": 0.01056993727979716, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01593857839179691, | |
| "entropy": 0.3097205422818661, | |
| "epoch": 0.000736, | |
| "grad_norm": 0.0013873938005417585, | |
| "kl": 0.21971550025045872, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "step": 92, | |
| "step_time": 41.30870744702406 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0018048831261694431, | |
| "clip_ratio/high_mean": 0.00023677908757235855, | |
| "clip_ratio/low_mean": 0.021204495129495626, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.021441274860990234, | |
| "completions/clipped_ratio": 0.1796875, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 8798.0, | |
| "completions/mean_length": 3474.4140625, | |
| "completions/mean_terminated_length": 646.6000366210938, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.16217100247740746, | |
| "epoch": 0.000744, | |
| "frac_reward_zero_std": 0.375, | |
| "grad_norm": 0.00539315864443779, | |
| "kl": 0.22048377990722656, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0072, | |
| "num_tokens": 33019531.0, | |
| "reward": 0.2454037368297577, | |
| "reward_std": 0.39766862988471985, | |
| "rewards/reward_func/mean": 0.2454037368297577, | |
| "rewards/reward_func/std": 0.3976685702800751, | |
| "sampling/importance_sampling_ratio/max": 2.8269150257110596, | |
| "sampling/importance_sampling_ratio/mean": 0.7975430488586426, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.909695863723755, | |
| "sampling/sampling_logp_difference/mean": 0.007487665396183729, | |
| "step": 93, | |
| "step_time": 474.8481697048992 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.04168192390443437, | |
| "clip_ratio/high_mean": 0.005210240488054296, | |
| "clip_ratio/low_mean": 0.0055653811286902055, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.01077562254795339, | |
| "entropy": 0.15040505304932594, | |
| "epoch": 0.000752, | |
| "grad_norm": 0.008554578758776188, | |
| "kl": 0.21248403005301952, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0569, | |
| "step": 94, | |
| "step_time": 175.67941991216503 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.004689611494541168, | |
| "clip_ratio/high_mean": 0.0007652845233678818, | |
| "clip_ratio/low_mean": 0.00020595593377947807, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0009712404571473598, | |
| "completions/clipped_ratio": 0.0703125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 15830.0, | |
| "completions/mean_length": 1703.2734375, | |
| "completions/mean_terminated_length": 592.9664306640625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.11936141178011894, | |
| "epoch": 0.00076, | |
| "frac_reward_zero_std": 0.5625, | |
| "grad_norm": 0.007037501782178879, | |
| "kl": 0.15314552932977676, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0121, | |
| "num_tokens": 33616110.0, | |
| "reward": 0.5862494707107544, | |
| "reward_std": 0.4870387017726898, | |
| "rewards/reward_func/mean": 0.5862494707107544, | |
| "rewards/reward_func/std": 0.4870387017726898, | |
| "sampling/importance_sampling_ratio/max": 1.921995997428894, | |
| "sampling/importance_sampling_ratio/mean": 0.8520303964614868, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.492267608642578, | |
| "sampling/sampling_logp_difference/mean": 0.0060460735112428665, | |
| "step": 95, | |
| "step_time": 279.93328415811993 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.1648085294291377, | |
| "clip_ratio/high_mean": 0.027523898315848783, | |
| "clip_ratio/low_mean": 0.00035762626794166863, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.02788152452558279, | |
| "entropy": 0.11354503780603409, | |
| "epoch": 0.000768, | |
| "grad_norm": 0.007192930206656456, | |
| "kl": 0.07495404127985239, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0076, | |
| "step": 96, | |
| "step_time": 81.44801545701921 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0718371415277943, | |
| "clip_ratio/high_mean": 0.009176566891255789, | |
| "clip_ratio/low_mean": 0.0004482567746890709, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.009624823651392944, | |
| "completions/clipped_ratio": 0.1640625, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 12879.0, | |
| "completions/mean_length": 3361.4921875, | |
| "completions/mean_terminated_length": 805.6728515625, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.09433631971478462, | |
| "epoch": 0.000776, | |
| "frac_reward_zero_std": 0.625, | |
| "grad_norm": 0.006469857878983021, | |
| "kl": 0.06316580064594746, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0171, | |
| "num_tokens": 34458349.0, | |
| "reward": 0.3976612091064453, | |
| "reward_std": 0.47885704040527344, | |
| "rewards/reward_func/mean": 0.3976612091064453, | |
| "rewards/reward_func/std": 0.47885704040527344, | |
| "sampling/importance_sampling_ratio/max": 1.6287328004837036, | |
| "sampling/importance_sampling_ratio/mean": 0.7641345262527466, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 2.092411518096924, | |
| "sampling/sampling_logp_difference/mean": 0.0073972526006400585, | |
| "step": 97, | |
| "step_time": 312.0833521957975 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.0019895988516509533, | |
| "clip_ratio/high_mean": 0.000492683844640851, | |
| "clip_ratio/low_mean": 0.004362157778814435, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.004854841623455286, | |
| "entropy": 0.11002197489142418, | |
| "epoch": 0.000784, | |
| "grad_norm": 0.0009743753471411765, | |
| "kl": 0.06945361755788326, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0, | |
| "step": 98, | |
| "step_time": 100.79940819228068 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.00242789089679718, | |
| "clip_ratio/high_mean": 0.00032243724854197353, | |
| "clip_ratio/low_mean": 3.6235200241208076e-05, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.0003586724487831816, | |
| "completions/clipped_ratio": 0.0078125, | |
| "completions/max_length": 16384.0, | |
| "completions/max_terminated_length": 9493.0, | |
| "completions/mean_length": 656.71875, | |
| "completions/mean_terminated_length": 532.8818969726562, | |
| "completions/min_length": 2.0, | |
| "completions/min_terminated_length": 2.0, | |
| "entropy": 0.12588375620543957, | |
| "epoch": 0.000792, | |
| "frac_reward_zero_std": 0.75, | |
| "grad_norm": 0.024520058184862137, | |
| "kl": 0.1035240120254457, | |
| "learning_rate": 0.0001, | |
| "loss": 0.0842, | |
| "num_tokens": 34860025.0, | |
| "reward": 0.5634695887565613, | |
| "reward_std": 0.4744718670845032, | |
| "rewards/reward_func/mean": 0.5634695887565613, | |
| "rewards/reward_func/std": 0.4744718670845032, | |
| "sampling/importance_sampling_ratio/max": 1.7210280895233154, | |
| "sampling/importance_sampling_ratio/mean": 0.9508634805679321, | |
| "sampling/importance_sampling_ratio/min": 0.0, | |
| "sampling/sampling_logp_difference/max": 1.2289901971817017, | |
| "sampling/sampling_logp_difference/mean": 0.006527569144964218, | |
| "step": 99, | |
| "step_time": 239.3775063320063 | |
| }, | |
| { | |
| "clip_ratio/high_max": 0.08591295027872548, | |
| "clip_ratio/high_mean": 0.011129089194582775, | |
| "clip_ratio/low_mean": 0.005252894119621487, | |
| "clip_ratio/low_min": 0.0, | |
| "clip_ratio/region_mean": 0.016381983092287555, | |
| "entropy": 0.11453884467482567, | |
| "epoch": 0.0008, | |
| "grad_norm": 0.01832975633442402, | |
| "kl": 0.032136627938598394, | |
| "learning_rate": 0.0001, | |
| "loss": -0.0691, | |
| "step": 100, | |
| "step_time": 78.3447534351144 | |
| } | |
| ], | |
| "logging_steps": 1, | |
| "max_steps": 10000, | |
| "num_input_tokens_seen": 34860025, | |
| "num_train_epochs": 1, | |
| "save_steps": 50, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 0.0, | |
| "train_batch_size": 2, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |