{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.7389162561576355, "eval_steps": 500, "global_step": 300, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3836.0, "completions/mean_length": 1083.9375, "completions/mean_terminated_length": 1047.7213134765625, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6694833338260651, "epoch": 0.0024630541871921183, "frac_reward_zero_std": 0.0, "grad_norm": 0.007095558031024415, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0409415178000927, "num_tokens": 155852.0, "reward": 0.76171875, "reward_std": 0.7998383641242981, "rewards/reward_func/mean": 0.08463541666666667, "rewards/reward_func/std": 0.12621609369913736, "sampling/importance_sampling_ratio/max": 2.9962944984436035, "sampling/importance_sampling_ratio/mean": 0.9515448808670044, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.445388793945312, "sampling/sampling_logp_difference/mean": 0.18421649932861328, "step": 1, "step_time": 221.00888453796506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4012.0, "completions/mean_length": 896.4375, "completions/mean_terminated_length": 855.8709716796875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6748273521661758, "epoch": 0.0049261083743842365, "frac_reward_zero_std": 0.0, "grad_norm": 0.006613421003843556, "kl": 0.0, "learning_rate": 1e-05, "loss": -0.01135256141424179, "num_tokens": 304376.0, "reward": 0.77734375, "reward_std": 0.450126051902771, "rewards/reward_func/mean": 0.08637152777777778, "rewards/reward_func/std": 0.06551425324545966, "sampling/importance_sampling_ratio/max": 2.9991252422332764, "sampling/importance_sampling_ratio/mean": 0.9491186738014221, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.011090278625488, "sampling/sampling_logp_difference/mean": 0.19341453909873962, "step": 2, "step_time": 124.78918863995932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 1291.609375, "completions/mean_terminated_length": 1131.1500244140625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6219745129346848, "epoch": 0.007389162561576354, "frac_reward_zero_std": 0.0, "grad_norm": 0.005562883860396661, "kl": 0.00021881231805309653, "learning_rate": 2e-05, "loss": 0.017589552327990532, "num_tokens": 466207.0, "reward": 0.8359375, "reward_std": 0.6223654747009277, "rewards/reward_func/mean": 0.09288194444444445, "rewards/reward_func/std": 0.08707591229014927, "sampling/importance_sampling_ratio/max": 2.998457431793213, "sampling/importance_sampling_ratio/mean": 0.9518132209777832, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.11111831665039, "sampling/sampling_logp_difference/mean": 0.18696464598178864, "step": 3, "step_time": 172.1733792340383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3972.0, "completions/max_terminated_length": 3972.0, "completions/mean_length": 858.046875, "completions/mean_terminated_length": 867.2257690429688, "completions/min_length": 144.0, "completions/min_terminated_length": 158.0, "degenerate_groups_filtered": 0.0, "entropy": 0.602585643529892, "epoch": 0.009852216748768473, "frac_reward_zero_std": 0.0, "grad_norm": 0.005487564821053171, "kl": 0.00019695455193868838, "learning_rate": 3e-05, "loss": -0.042750950902700424, "num_tokens": 594530.0, "reward": 0.9296875, "reward_std": 0.4949522018432617, "rewards/reward_func/mean": 0.1032986111111111, "rewards/reward_func/std": 0.07230462630589803, "sampling/importance_sampling_ratio/max": 2.997239112854004, "sampling/importance_sampling_ratio/mean": 0.959896445274353, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.613371849060059, "sampling/sampling_logp_difference/mean": 0.1705228090286255, "step": 4, "step_time": 106.91271968232468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2055.0, "completions/mean_length": 773.546875, "completions/mean_terminated_length": 678.084716796875, "completions/min_length": 11.0, "completions/min_terminated_length": 71.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6291423887014389, "epoch": 0.012315270935960592, "frac_reward_zero_std": 0.0, "grad_norm": 0.006864817471537119, "kl": 0.00026583998987916857, "learning_rate": 4e-05, "loss": -0.02318686991930008, "num_tokens": 725013.0, "reward": 0.73046875, "reward_std": 0.6079902648925781, "rewards/reward_func/mean": 0.08116319444444445, "rewards/reward_func/std": 0.08406046364042494, "sampling/importance_sampling_ratio/max": 2.9972803592681885, "sampling/importance_sampling_ratio/mean": 0.9578667283058167, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.874061584472656, "sampling/sampling_logp_difference/mean": 0.17526790499687195, "step": 5, "step_time": 157.378293489106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2283.0, "completions/mean_length": 949.609375, "completions/mean_terminated_length": 858.4067993164062, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7411623895168304, "epoch": 0.014778325123152709, "frac_reward_zero_std": 0.0, "grad_norm": 0.00841264236101695, "kl": 0.0002813305109157227, "learning_rate": 5e-05, "loss": 0.022779636085033417, "num_tokens": 886028.0, "reward": 0.82421875, "reward_std": 0.8161072731018066, "rewards/reward_func/mean": 0.0915798611111111, "rewards/reward_func/std": 0.12717805471685198, "sampling/importance_sampling_ratio/max": 2.996835947036743, "sampling/importance_sampling_ratio/mean": 0.9440486431121826, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.687472343444824, "sampling/sampling_logp_difference/mean": 0.21788033843040466, "step": 6, "step_time": 139.1690670568496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3740.0, "completions/mean_length": 1115.5625, "completions/mean_terminated_length": 1082.360595703125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7721924930810928, "epoch": 0.017241379310344827, "frac_reward_zero_std": 0.0, "grad_norm": 0.006704431263996823, "kl": 0.0003544362625689246, "learning_rate": 4.999995293306428e-05, "loss": -0.035419315099716187, "num_tokens": 1055376.0, "reward": 0.80078125, "reward_std": 0.607786238193512, "rewards/reward_func/mean": 0.08897569444444445, "rewards/reward_func/std": 0.08487503065003289, "sampling/importance_sampling_ratio/max": 2.9902729988098145, "sampling/importance_sampling_ratio/mean": 0.9370421171188354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.689960479736328, "sampling/sampling_logp_difference/mean": 0.23172584176063538, "step": 7, "step_time": 136.04291818407364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3636.0, "completions/mean_length": 1065.609375, "completions/mean_terminated_length": 1024.2333984375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7013488560914993, "epoch": 0.019704433497536946, "frac_reward_zero_std": 0.0, "grad_norm": 0.006049884152623777, "kl": 0.00045841842802474275, "learning_rate": 4.999981173243434e-05, "loss": 0.0060663241893053055, "num_tokens": 1214471.0, "reward": 0.96484375, "reward_std": 0.7221924066543579, "rewards/reward_func/mean": 0.1072048611111111, "rewards/reward_func/std": 0.1487934175464842, "sampling/importance_sampling_ratio/max": 2.9969818592071533, "sampling/importance_sampling_ratio/mean": 0.9477940797805786, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.812170028686523, "sampling/sampling_logp_difference/mean": 0.20687559247016907, "step": 8, "step_time": 120.98894325993024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2526.0, "completions/max_terminated_length": 2526.0, "completions/mean_length": 608.6875, "completions/mean_terminated_length": 604.5556030273438, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7017630189657211, "epoch": 0.022167487684729065, "frac_reward_zero_std": 0.0, "grad_norm": 0.00539651772659363, "kl": 0.000361532969691325, "learning_rate": 4.999957639864185e-05, "loss": -0.03007565438747406, "num_tokens": 1336179.0, "reward": 0.97265625, "reward_std": 0.450126051902771, "rewards/reward_func/mean": 0.10807291666666667, "rewards/reward_func/std": 0.06657051046689351, "sampling/importance_sampling_ratio/max": 2.9878950119018555, "sampling/importance_sampling_ratio/mean": 0.958396315574646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.588211059570312, "sampling/sampling_logp_difference/mean": 0.18251243233680725, "step": 9, "step_time": 77.26975405705161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2819.0, "completions/mean_length": 1163.921875, "completions/mean_terminated_length": 968.4500732421875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7973304092884064, "epoch": 0.024630541871921183, "frac_reward_zero_std": 0.0, "grad_norm": 0.005535093066648039, "kl": 0.0025367297348566353, "learning_rate": 4.999924693257293e-05, "loss": -0.026630321517586708, "num_tokens": 1510574.0, "reward": 0.80859375, "reward_std": 0.45369336009025574, "rewards/reward_func/mean": 0.08984375, "rewards/reward_func/std": 0.06406109862857395, "sampling/importance_sampling_ratio/max": 2.997955799102783, "sampling/importance_sampling_ratio/mean": 0.9407488703727722, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.1378812789917, "sampling/sampling_logp_difference/mean": 0.23701190948486328, "step": 10, "step_time": 218.45970374671742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3662.0, "completions/mean_length": 1229.671875, "completions/mean_terminated_length": 1112.7626953125, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "degenerate_groups_filtered": 0.0, "entropy": 0.60106560587883, "epoch": 0.027093596059113302, "frac_reward_zero_std": 0.0, "grad_norm": 0.003343557750625506, "kl": 0.0005505645822267979, "learning_rate": 4.9998823335468127e-05, "loss": -0.02195553667843342, "num_tokens": 1673273.0, "reward": 0.80078125, "reward_std": 0.37580519914627075, "rewards/reward_func/mean": 0.08897569444444445, "rewards/reward_func/std": 0.041756133238474526, "sampling/importance_sampling_ratio/max": 2.9994499683380127, "sampling/importance_sampling_ratio/mean": 0.9576238393783569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.029350280761719, "sampling/sampling_logp_difference/mean": 0.17449143528938293, "step": 11, "step_time": 112.48419295996428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3348.0, "completions/max_terminated_length": 3348.0, "completions/mean_length": 763.203125, "completions/mean_terminated_length": 763.203125, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7593114227056503, "epoch": 0.029556650246305417, "frac_reward_zero_std": 0.25, "grad_norm": 0.004228222685487359, "kl": 0.0008225990168284625, "learning_rate": 4.9998305608922444e-05, "loss": -0.00977338943630457, "num_tokens": 1815814.0, "reward": 0.9375, "reward_std": 0.417855441570282, "rewards/reward_func/mean": 0.10416666666666667, "rewards/reward_func/std": 0.06210504803392622, "sampling/importance_sampling_ratio/max": 2.998997926712036, "sampling/importance_sampling_ratio/mean": 0.9500037431716919, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.969398498535156, "sampling/sampling_logp_difference/mean": 0.20405714213848114, "step": 12, "step_time": 124.43193195271306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2881.0, "completions/mean_length": 1180.875, "completions/mean_terminated_length": 1134.603271484375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7117026150226593, "epoch": 0.03201970443349754, "frac_reward_zero_std": 0.0, "grad_norm": 0.004516257159699829, "kl": 0.0005648009391734377, "learning_rate": 4.99976937548853e-05, "loss": 0.010042570531368256, "num_tokens": 1978094.0, "reward": 0.99609375, "reward_std": 0.4575039744377136, "rewards/reward_func/mean": 0.11067708333333333, "rewards/reward_func/std": 0.06885812017652723, "sampling/importance_sampling_ratio/max": 2.9992194175720215, "sampling/importance_sampling_ratio/mean": 0.94798743724823, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.557123184204102, "sampling/sampling_logp_difference/mean": 0.20858065783977509, "step": 13, "step_time": 165.92772811092436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2549.0, "completions/mean_length": 823.015625, "completions/mean_terminated_length": 772.1612548828125, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6039450019598007, "epoch": 0.034482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.025429700762270694, "kl": 0.068690528118168, "learning_rate": 4.999698777566055e-05, "loss": -0.03384635969996452, "num_tokens": 2123599.0, "reward": 1.05078125, "reward_std": 0.6380459070205688, "rewards/reward_func/mean": 0.11675347222222222, "rewards/reward_func/std": 0.11462441086769104, "sampling/importance_sampling_ratio/max": 2.9970529079437256, "sampling/importance_sampling_ratio/mean": 0.955413818359375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.131617546081543, "sampling/sampling_logp_difference/mean": 0.18180659413337708, "step": 14, "step_time": 132.856802233262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3489.0, "completions/max_terminated_length": 3489.0, "completions/mean_length": 953.5, "completions/mean_terminated_length": 953.5, "completions/min_length": 245.0, "completions/min_terminated_length": 245.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6297731846570969, "epoch": 0.03694581280788178, "frac_reward_zero_std": 0.25, "grad_norm": 0.00198453501467837, "kl": 0.000561516048037447, "learning_rate": 4.9996187673906445e-05, "loss": -0.006561854854226112, "num_tokens": 2272399.0, "reward": 1.1171875, "reward_std": 0.39833173155784607, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.0564837654431661, "sampling/importance_sampling_ratio/max": 2.999617338180542, "sampling/importance_sampling_ratio/mean": 0.9544141292572021, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.673768997192383, "sampling/sampling_logp_difference/mean": 0.18456798791885376, "step": 15, "step_time": 113.08367080404423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1192.0, "completions/mean_length": 692.640625, "completions/mean_terminated_length": 538.0327758789062, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6860306113958359, "epoch": 0.03940886699507389, "frac_reward_zero_std": 0.0, "grad_norm": 0.004792147547513456, "kl": 0.0005804097963846289, "learning_rate": 4.9995293452635664e-05, "loss": -0.0032190822530537844, "num_tokens": 2400168.0, "reward": 0.9921875, "reward_std": 0.2780628502368927, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.0443571772840288, "sampling/importance_sampling_ratio/max": 2.9948651790618896, "sampling/importance_sampling_ratio/mean": 0.9640507698059082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.868938446044922, "sampling/sampling_logp_difference/mean": 0.1751737892627716, "step": 16, "step_time": 131.08224705676548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3842.0, "completions/mean_length": 903.828125, "completions/mean_terminated_length": 853.1587524414062, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6448574513196945, "epoch": 0.04187192118226601, "frac_reward_zero_std": 0.0, "grad_norm": 0.002971407555356022, "kl": 0.0007872640417190269, "learning_rate": 4.999430511521525e-05, "loss": -0.009143276140093803, "num_tokens": 2549357.0, "reward": 0.96875, "reward_std": 0.3196600377559662, "rewards/reward_func/mean": 0.1076388888888889, "rewards/reward_func/std": 0.04835580620500776, "sampling/importance_sampling_ratio/max": 2.997056007385254, "sampling/importance_sampling_ratio/mean": 0.9508565664291382, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.473780632019043, "sampling/sampling_logp_difference/mean": 0.19167384505271912, "step": 17, "step_time": 127.22956793638878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2759.0, "completions/mean_length": 856.609375, "completions/mean_terminated_length": 812.3933715820312, "completions/min_length": 27.0, "completions/min_terminated_length": 27.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7665040791034698, "epoch": 0.04433497536945813, "frac_reward_zero_std": 0.0, "grad_norm": 0.0037058027286737227, "kl": 0.0015929393121041358, "learning_rate": 4.999322266536666e-05, "loss": -0.00527946138754487, "num_tokens": 2696964.0, "reward": 0.9921875, "reward_std": 0.3211115002632141, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.0500405298338996, "sampling/importance_sampling_ratio/max": 2.9947409629821777, "sampling/importance_sampling_ratio/mean": 0.9457135796546936, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.665953636169434, "sampling/sampling_logp_difference/mean": 0.21514487266540527, "step": 18, "step_time": 140.69508987711743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2693.0, "completions/mean_length": 992.046875, "completions/mean_terminated_length": 957.3386840820312, "completions/min_length": 40.0, "completions/min_terminated_length": 152.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7168809026479721, "epoch": 0.046798029556650245, "frac_reward_zero_std": 0.0, "grad_norm": 0.004521973839699517, "kl": 0.0006615275924559683, "learning_rate": 4.9992046107165705e-05, "loss": 0.009132737293839455, "num_tokens": 2852695.0, "reward": 1.1484375, "reward_std": 0.7801083326339722, "rewards/reward_func/mean": 0.12760416666666666, "rewards/reward_func/std": 0.12675773766305712, "sampling/importance_sampling_ratio/max": 2.998769998550415, "sampling/importance_sampling_ratio/mean": 0.947180449962616, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.349352836608887, "sampling/sampling_logp_difference/mean": 0.2125585377216339, "step": 19, "step_time": 131.2465523199644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3261.0, "completions/max_terminated_length": 3261.0, "completions/mean_length": 775.296875, "completions/mean_terminated_length": 775.296875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6376301944255829, "epoch": 0.04926108374384237, "frac_reward_zero_std": 0.25, "grad_norm": 0.0021053398882222665, "kl": 0.0007705071911914274, "learning_rate": 4.999077544504252e-05, "loss": -0.02608906291425228, "num_tokens": 2983834.0, "reward": 1.04296875, "reward_std": 0.3320053517818451, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.04976920617951287, "sampling/importance_sampling_ratio/max": 2.998075008392334, "sampling/importance_sampling_ratio/mean": 0.9615002274513245, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.996427536010742, "sampling/sampling_logp_difference/mean": 0.171400785446167, "step": 20, "step_time": 94.92565709678456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3497.0, "completions/mean_length": 946.609375, "completions/mean_terminated_length": 901.901611328125, "completions/min_length": 69.0, "completions/min_terminated_length": 145.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7160081267356873, "epoch": 0.05172413793103448, "frac_reward_zero_std": 0.25, "grad_norm": 0.0015621206257052126, "kl": 0.0006340539694065228, "learning_rate": 4.998941068378163e-05, "loss": -0.02198156528174877, "num_tokens": 3128385.0, "reward": 1.12109375, "reward_std": 0.39589208364486694, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.0562092297606998, "sampling/importance_sampling_ratio/max": 2.9852712154388428, "sampling/importance_sampling_ratio/mean": 0.9506601691246033, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.786170959472656, "sampling/sampling_logp_difference/mean": 0.19989526271820068, "step": 21, "step_time": 136.62699230923317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1973.0, "completions/max_terminated_length": 1973.0, "completions/mean_length": 604.4375, "completions/mean_terminated_length": 608.2698974609375, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6837466955184937, "epoch": 0.054187192118226604, "frac_reward_zero_std": 0.0, "grad_norm": 0.00834361275371499, "kl": 0.0009060032753041014, "learning_rate": 4.998795182852183e-05, "loss": 0.07563716918230057, "num_tokens": 3237005.0, "reward": 1.1171875, "reward_std": 0.7223318815231323, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.11290023724238078, "sampling/importance_sampling_ratio/max": 2.9993104934692383, "sampling/importance_sampling_ratio/mean": 0.9645485877990723, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.818716049194336, "sampling/sampling_logp_difference/mean": 0.16996437311172485, "step": 22, "step_time": 68.31864915997721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3720.0, "completions/mean_length": 1099.0625, "completions/mean_terminated_length": 951.6720581054688, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "degenerate_groups_filtered": 0.0, "entropy": 0.682835578918457, "epoch": 0.05665024630541872, "frac_reward_zero_std": 0.25, "grad_norm": 0.0028399998898074653, "kl": 0.0007875877781771123, "learning_rate": 4.998639888475621e-05, "loss": 0.010182402096688747, "num_tokens": 3387729.0, "reward": 1.125, "reward_std": 0.44095855951309204, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.06258171962367164, "sampling/importance_sampling_ratio/max": 2.9974377155303955, "sampling/importance_sampling_ratio/mean": 0.950944185256958, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.174954414367676, "sampling/sampling_logp_difference/mean": 0.19639119505882263, "step": 23, "step_time": 200.68762891716324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2514.0, "completions/mean_length": 722.125, "completions/mean_terminated_length": 678.241943359375, "completions/min_length": 41.0, "completions/min_terminated_length": 41.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7390763312578201, "epoch": 0.059113300492610835, "frac_reward_zero_std": 0.0, "grad_norm": 0.005278419593389374, "kl": 0.0011095789086539298, "learning_rate": 4.998475185833219e-05, "loss": 0.02633114904165268, "num_tokens": 3512121.0, "reward": 1.05859375, "reward_std": 0.7151176333427429, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.11182467308309343, "sampling/importance_sampling_ratio/max": 2.99768328666687, "sampling/importance_sampling_ratio/mean": 0.9494476318359375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.124560356140137, "sampling/sampling_logp_difference/mean": 0.213691845536232, "step": 24, "step_time": 137.3248422681354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3199.0, "completions/mean_length": 984.203125, "completions/mean_terminated_length": 831.1638793945312, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7115143835544586, "epoch": 0.06157635467980296, "frac_reward_zero_std": 0.25, "grad_norm": 0.0015712856310160254, "kl": 0.0009124837379204109, "learning_rate": 4.9983010755451386e-05, "loss": -0.00787552259862423, "num_tokens": 3655670.0, "reward": 1.1328125, "reward_std": 0.42014914751052856, "rewards/reward_func/mean": 0.12586805555555555, "rewards/reward_func/std": 0.06024486985471514, "sampling/importance_sampling_ratio/max": 2.9933128356933594, "sampling/importance_sampling_ratio/mean": 0.9579269289970398, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.12272834777832, "sampling/sampling_logp_difference/mean": 0.19365090131759644, "step": 25, "step_time": 138.73497348395176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3386.0, "completions/mean_length": 953.484375, "completions/mean_terminated_length": 884.9193115234375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7320111691951752, "epoch": 0.06403940886699508, "frac_reward_zero_std": 0.25, "grad_norm": 0.005384163960889445, "kl": 0.0012063481844961643, "learning_rate": 4.998117558266968e-05, "loss": 0.026074275374412537, "num_tokens": 3816309.0, "reward": 0.9609375, "reward_std": 0.4655146300792694, "rewards/reward_func/mean": 0.10677083333333333, "rewards/reward_func/std": 0.06866345471805996, "sampling/importance_sampling_ratio/max": 2.9988629817962646, "sampling/importance_sampling_ratio/mean": 0.9447786808013916, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.807069778442383, "sampling/sampling_logp_difference/mean": 0.21771396696567535, "step": 26, "step_time": 126.19171593617648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3648.0, "completions/mean_length": 1288.53125, "completions/mean_terminated_length": 1150.458984375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7377140372991562, "epoch": 0.0665024630541872, "frac_reward_zero_std": 0.25, "grad_norm": 0.003947554913592225, "kl": 0.0010491551220184192, "learning_rate": 4.9979246346897136e-05, "loss": -0.014140678569674492, "num_tokens": 3986487.0, "reward": 0.9609375, "reward_std": 0.430067241191864, "rewards/reward_func/mean": 0.10677083333333333, "rewards/reward_func/std": 0.06397370166248745, "sampling/importance_sampling_ratio/max": 2.999089002609253, "sampling/importance_sampling_ratio/mean": 0.9488558769226074, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.052026748657227, "sampling/sampling_logp_difference/mean": 0.20844586193561554, "step": 27, "step_time": 130.6179301950615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3662.0, "completions/max_terminated_length": 3662.0, "completions/mean_length": 585.265625, "completions/mean_terminated_length": 585.265625, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7785868942737579, "epoch": 0.06896551724137931, "frac_reward_zero_std": 0.25, "grad_norm": 0.0021403851160726214, "kl": 0.0013080878125037998, "learning_rate": 4.997722305539802e-05, "loss": 0.012006907723844051, "num_tokens": 4101032.0, "reward": 1.08203125, "reward_std": 0.3508908450603485, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.049897139271100364, "sampling/importance_sampling_ratio/max": 2.9983808994293213, "sampling/importance_sampling_ratio/mean": 0.9581422209739685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.397562026977539, "sampling/sampling_logp_difference/mean": 0.1975124478340149, "step": 28, "step_time": 109.09123803512193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3958.0, "completions/mean_length": 1360.0625, "completions/mean_terminated_length": 1271.806396484375, "completions/min_length": 425.0, "completions/min_terminated_length": 425.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7062833905220032, "epoch": 0.07142857142857142, "frac_reward_zero_std": 0.25, "grad_norm": 0.002369998916127879, "kl": 0.0010581198730506003, "learning_rate": 4.997510571579074e-05, "loss": 0.004455733112990856, "num_tokens": 4268556.0, "reward": 1.02734375, "reward_std": 0.34823018312454224, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.052287484208742775, "sampling/importance_sampling_ratio/max": 2.999220609664917, "sampling/importance_sampling_ratio/mean": 0.9467811584472656, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.120293617248535, "sampling/sampling_logp_difference/mean": 0.21002380549907684, "step": 29, "step_time": 180.742001067847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 1096.84375, "completions/mean_terminated_length": 1054.4031982421875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "degenerate_groups_filtered": 0.0, "entropy": 0.714943453669548, "epoch": 0.07389162561576355, "frac_reward_zero_std": 0.0, "grad_norm": 0.004091906592403502, "kl": 0.0014412851742235944, "learning_rate": 4.997289433604783e-05, "loss": -0.02012975513935089, "num_tokens": 4435650.0, "reward": 0.95703125, "reward_std": 0.4444425106048584, "rewards/reward_func/mean": 0.10633680555555555, "rewards/reward_func/std": 0.06608307692739698, "sampling/importance_sampling_ratio/max": 2.9989187717437744, "sampling/importance_sampling_ratio/mean": 0.9398482441902161, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.725874900817871, "sampling/sampling_logp_difference/mean": 0.23292841017246246, "step": 30, "step_time": 150.5774575888645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4083.0, "completions/mean_length": 958.859375, "completions/mean_terminated_length": 909.0635375976562, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7270680814981461, "epoch": 0.07635467980295567, "frac_reward_zero_std": 0.5, "grad_norm": 0.0040443946723161235, "kl": 0.0008169857028406113, "learning_rate": 4.997058892449591e-05, "loss": -0.010359976440668106, "num_tokens": 4589337.0, "reward": 1.05859375, "reward_std": 0.46235719323158264, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.06781361169285244, "sampling/importance_sampling_ratio/max": 2.9993908405303955, "sampling/importance_sampling_ratio/mean": 0.9479804039001465, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.141839981079102, "sampling/sampling_logp_difference/mean": 0.2137700319290161, "step": 31, "step_time": 129.58590258820914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3918.0, "completions/max_terminated_length": 3918.0, "completions/mean_length": 1016.40625, "completions/mean_terminated_length": 1019.74609375, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7047373950481415, "epoch": 0.07881773399014778, "frac_reward_zero_std": 0.75, "grad_norm": 0.002745619654401485, "kl": 0.0010187966545345262, "learning_rate": 4.99681894898157e-05, "loss": 0.007434252183884382, "num_tokens": 4751859.0, "reward": 1.125, "reward_std": 0.37796446681022644, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.05282027191585965, "sampling/importance_sampling_ratio/max": 2.9945504665374756, "sampling/importance_sampling_ratio/mean": 0.9459384679794312, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.938006401062012, "sampling/sampling_logp_difference/mean": 0.20850920677185059, "step": 32, "step_time": 165.751161579974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3819.0, "completions/mean_length": 889.015625, "completions/mean_terminated_length": 785.5645141601562, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6745845079421997, "epoch": 0.0812807881773399, "frac_reward_zero_std": 0.0, "grad_norm": 0.0042239766899397155, "kl": 0.002233659179182723, "learning_rate": 4.99656960410419e-05, "loss": -0.0034338748082518578, "num_tokens": 4892388.0, "reward": 1.06640625, "reward_std": 0.5180131793022156, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.0750073492527008, "sampling/importance_sampling_ratio/max": 2.9851253032684326, "sampling/importance_sampling_ratio/mean": 0.9577789306640625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.382405281066895, "sampling/sampling_logp_difference/mean": 0.174952894449234, "step": 33, "step_time": 115.10072640073486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1721.0, "completions/max_terminated_length": 1721.0, "completions/mean_length": 599.8125, "completions/mean_terminated_length": 599.8125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7060859650373459, "epoch": 0.08374384236453201, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038322618214211897, "kl": 0.0012007179611828178, "learning_rate": 4.9963108587563226e-05, "loss": -0.013989871367812157, "num_tokens": 5011160.0, "reward": 0.9765625, "reward_std": 0.2842378616333008, "rewards/reward_func/mean": 0.10850694444444445, "rewards/reward_func/std": 0.04346192214224073, "sampling/importance_sampling_ratio/max": 2.997807741165161, "sampling/importance_sampling_ratio/mean": 0.9572877287864685, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.312417030334473, "sampling/sampling_logp_difference/mean": 0.19250346720218658, "step": 34, "step_time": 64.04967820202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4077.0, "completions/mean_length": 914.203125, "completions/mean_terminated_length": 863.698486328125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7091882675886154, "epoch": 0.08620689655172414, "frac_reward_zero_std": 0.0, "grad_norm": 0.006479029390818219, "kl": 0.0012216337054269388, "learning_rate": 4.996042713912238e-05, "loss": 0.027525117620825768, "num_tokens": 5157541.0, "reward": 1.078125, "reward_std": 0.5379911661148071, "rewards/reward_func/mean": 0.11979166666666667, "rewards/reward_func/std": 0.10002665056122674, "sampling/importance_sampling_ratio/max": 2.9995338916778564, "sampling/importance_sampling_ratio/mean": 0.9554766416549683, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.678374290466309, "sampling/sampling_logp_difference/mean": 0.1935516595840454, "step": 35, "step_time": 158.35729796788655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3923.0, "completions/mean_length": 1026.171875, "completions/mean_terminated_length": 939.274169921875, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7677580267190933, "epoch": 0.08866995073891626, "frac_reward_zero_std": 0.25, "grad_norm": 0.003969688289252204, "kl": 0.001730678923195228, "learning_rate": 4.995765170581595e-05, "loss": -0.004793988540768623, "num_tokens": 5309680.0, "reward": 1.0390625, "reward_std": 0.5083016157150269, "rewards/reward_func/mean": 0.1154513888888889, "rewards/reward_func/std": 0.07348066899511549, "sampling/importance_sampling_ratio/max": 2.9985527992248535, "sampling/importance_sampling_ratio/mean": 0.9522184133529663, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.271164894104004, "sampling/sampling_logp_difference/mean": 0.2057083398103714, "step": 36, "step_time": 119.47087240288965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3244.0, "completions/max_terminated_length": 3244.0, "completions/mean_length": 764.640625, "completions/mean_terminated_length": 776.3492431640625, "completions/min_length": 27.0, "completions/min_terminated_length": 121.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6654424071311951, "epoch": 0.09113300492610837, "frac_reward_zero_std": 0.0, "grad_norm": 0.01059432690079601, "kl": 0.0017997757968259975, "learning_rate": 4.995478229809444e-05, "loss": 0.06814411282539368, "num_tokens": 5442105.0, "reward": 1.0, "reward_std": 0.7440237998962402, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.11659446193112268, "sampling/importance_sampling_ratio/max": 2.9902219772338867, "sampling/importance_sampling_ratio/mean": 0.9585555791854858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.091999053955078, "sampling/sampling_logp_difference/mean": 0.18571698665618896, "step": 37, "step_time": 100.38183392700739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3807.0, "completions/mean_length": 1295.9375, "completions/mean_terminated_length": 1065.9482421875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6556531488895416, "epoch": 0.09359605911330049, "frac_reward_zero_std": 0.25, "grad_norm": 0.002583138605080032, "kl": 0.0012453852395992726, "learning_rate": 4.9951818926762174e-05, "loss": 0.004991541150957346, "num_tokens": 5616053.0, "reward": 1.0703125, "reward_std": 0.45090022683143616, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.06625068187713623, "sampling/importance_sampling_ratio/max": 2.9976985454559326, "sampling/importance_sampling_ratio/mean": 0.9467884302139282, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.92337417602539, "sampling/sampling_logp_difference/mean": 0.20371927320957184, "step": 38, "step_time": 146.80642993724905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2818.0, "completions/max_terminated_length": 2818.0, "completions/mean_length": 838.375, "completions/mean_terminated_length": 838.375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "degenerate_groups_filtered": 1.0, "entropy": 0.616532102227211, "epoch": 0.0960591133004926, "frac_reward_zero_std": 0.25, "grad_norm": 0.002685031367942058, "kl": 0.0010408478119643405, "learning_rate": 4.99487616029773e-05, "loss": -0.0135424192994833, "num_tokens": 5746829.0, "reward": 0.99609375, "reward_std": 0.2539067268371582, "rewards/reward_func/mean": 0.11067708333333333, "rewards/reward_func/std": 0.03920013705889384, "sampling/importance_sampling_ratio/max": 2.9999613761901855, "sampling/importance_sampling_ratio/mean": 0.9601303339004517, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.606135368347168, "sampling/sampling_logp_difference/mean": 0.16902627050876617, "step": 39, "step_time": 103.52780347992666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3299.0, "completions/mean_length": 1121.0, "completions/mean_terminated_length": 1025.0322265625, "completions/min_length": 79.0, "completions/min_terminated_length": 79.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6732313334941864, "epoch": 0.09852216748768473, "frac_reward_zero_std": 0.5, "grad_norm": 0.0017140794179257596, "kl": 0.0012895975669380277, "learning_rate": 4.994561033825174e-05, "loss": -0.0004059688653796911, "num_tokens": 5898221.0, "reward": 1.0703125, "reward_std": 0.32874444127082825, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.0472567660941018, "sampling/importance_sampling_ratio/max": 2.993136405944824, "sampling/importance_sampling_ratio/mean": 0.9536988735198975, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.736954689025879, "sampling/sampling_logp_difference/mean": 0.18731635808944702, "step": 40, "step_time": 145.77268586610444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3343.0, "completions/mean_length": 1018.46875, "completions/mean_terminated_length": 867.1146850585938, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6824723184108734, "epoch": 0.10098522167487685, "frac_reward_zero_std": 0.25, "grad_norm": 0.011154652575187822, "kl": 0.0008386526315007359, "learning_rate": 4.99423651444511e-05, "loss": -0.022859971970319748, "num_tokens": 6041355.0, "reward": 1.1015625, "reward_std": 0.7356008291244507, "rewards/reward_func/mean": 0.12239583333333333, "rewards/reward_func/std": 0.11811710231833988, "sampling/importance_sampling_ratio/max": 2.9998111724853516, "sampling/importance_sampling_ratio/mean": 0.9516376256942749, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.299010276794434, "sampling/sampling_logp_difference/mean": 0.19706137478351593, "step": 41, "step_time": 195.1581382418517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3209.0, "completions/mean_length": 1293.296875, "completions/mean_terminated_length": 1055.7796630859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5589832067489624, "epoch": 0.10344827586206896, "frac_reward_zero_std": 0.25, "grad_norm": 0.0025548626007616385, "kl": 0.0014480030513368547, "learning_rate": 4.993902603379471e-05, "loss": -0.005358518101274967, "num_tokens": 6211614.0, "reward": 0.98828125, "reward_std": 0.3603065609931946, "rewards/reward_func/mean": 0.10980902777777778, "rewards/reward_func/std": 0.05452203916178809, "sampling/importance_sampling_ratio/max": 2.999878406524658, "sampling/importance_sampling_ratio/mean": 0.9534763693809509, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.18749713897705, "sampling/sampling_logp_difference/mean": 0.1753091663122177, "step": 42, "step_time": 121.34253464243375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3998.0, "completions/max_terminated_length": 3998.0, "completions/mean_length": 952.796875, "completions/mean_terminated_length": 952.796875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6009114980697632, "epoch": 0.10591133004926108, "frac_reward_zero_std": 0.5, "grad_norm": 0.0015214392359086822, "kl": 0.001146472553955391, "learning_rate": 4.99355930188555e-05, "loss": 0.0028980104252696037, "num_tokens": 6353121.0, "reward": 1.07421875, "reward_std": 0.32635459303855896, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.04690552916791704, "sampling/importance_sampling_ratio/max": 2.999147653579712, "sampling/importance_sampling_ratio/mean": 0.9571791887283325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.376380920410156, "sampling/sampling_logp_difference/mean": 0.17375709116458893, "step": 43, "step_time": 124.00942197302356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2824.0, "completions/mean_length": 849.8125, "completions/mean_terminated_length": 798.2857666015625, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7530709803104401, "epoch": 0.10837438423645321, "frac_reward_zero_std": 0.75, "grad_norm": 0.0013039957282542235, "kl": 0.001282885583350435, "learning_rate": 4.9932066112559975e-05, "loss": -0.009485810995101929, "num_tokens": 6499989.0, "reward": 1.11328125, "reward_std": 0.39322054386138916, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.05615971154636807, "sampling/importance_sampling_ratio/max": 2.9940052032470703, "sampling/importance_sampling_ratio/mean": 0.950169563293457, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.745084762573242, "sampling/sampling_logp_difference/mean": 0.20593324303627014, "step": 44, "step_time": 131.83942927396856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2651.0, "completions/mean_length": 989.75, "completions/mean_terminated_length": 940.4445190429688, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7023928314447403, "epoch": 0.11083743842364532, "frac_reward_zero_std": 0.0, "grad_norm": 0.002663078811979883, "kl": 0.0010599846136756241, "learning_rate": 4.992844532818821e-05, "loss": -0.017974235117435455, "num_tokens": 6656725.0, "reward": 0.984375, "reward_std": 0.29504841566085815, "rewards/reward_func/mean": 0.109375, "rewards/reward_func/std": 0.04466936323377821, "sampling/importance_sampling_ratio/max": 2.9983725547790527, "sampling/importance_sampling_ratio/mean": 0.9500045776367188, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.430854797363281, "sampling/sampling_logp_difference/mean": 0.20036830008029938, "step": 45, "step_time": 132.1835569611285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3980.0, "completions/mean_length": 1233.3125, "completions/mean_terminated_length": 1188.6773681640625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7167646884918213, "epoch": 0.11330049261083744, "frac_reward_zero_std": 0.0, "grad_norm": 0.0015582099411917988, "kl": 0.0030349711887538433, "learning_rate": 4.9924730679373735e-05, "loss": -0.0024594487622380257, "num_tokens": 6831081.0, "reward": 1.03125, "reward_std": 0.3258078992366791, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.050385665562417775, "sampling/importance_sampling_ratio/max": 2.998929977416992, "sampling/importance_sampling_ratio/mean": 0.9458686113357544, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.93498420715332, "sampling/sampling_logp_difference/mean": 0.20973166823387146, "step": 46, "step_time": 182.55769080412574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3671.0, "completions/mean_length": 931.59375, "completions/mean_terminated_length": 881.3651123046875, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6520289331674576, "epoch": 0.11576354679802955, "frac_reward_zero_std": 0.5, "grad_norm": 0.005328116211785687, "kl": 0.0031528117979178205, "learning_rate": 4.992092218010351e-05, "loss": 0.02094169706106186, "num_tokens": 6989151.0, "reward": 1.15625, "reward_std": 0.6950790882110596, "rewards/reward_func/mean": 0.1284722222222222, "rewards/reward_func/std": 0.10439738300111559, "sampling/importance_sampling_ratio/max": 2.9967551231384277, "sampling/importance_sampling_ratio/mean": 0.9473298788070679, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.523643493652344, "sampling/sampling_logp_difference/mean": 0.2068655788898468, "step": 47, "step_time": 140.63169015408494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 846.890625, "completions/mean_terminated_length": 769.3770141601562, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5718538910150528, "epoch": 0.11822660098522167, "frac_reward_zero_std": 0.75, "grad_norm": 0.0007047123566561605, "kl": 0.0011549281189218163, "learning_rate": 4.991701984471789e-05, "loss": 0.01098698377609253, "num_tokens": 7113624.0, "reward": 1.07421875, "reward_std": 0.2734251022338867, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.03352663583225674, "sampling/importance_sampling_ratio/max": 2.99415922164917, "sampling/importance_sampling_ratio/mean": 0.9616100192070007, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.49928092956543, "sampling/sampling_logp_difference/mean": 0.16297030448913574, "step": 48, "step_time": 119.48140913574025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3726.0, "completions/mean_length": 1220.875, "completions/mean_terminated_length": 1079.475341796875, "completions/min_length": 266.0, "completions/min_terminated_length": 266.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6797353476285934, "epoch": 0.1206896551724138, "frac_reward_zero_std": 0.5, "grad_norm": 0.0020895404039787254, "kl": 0.0014812646841164678, "learning_rate": 4.9913023687910575e-05, "loss": 0.0024993023835122585, "num_tokens": 7288064.0, "reward": 1.0625, "reward_std": 0.37796446681022644, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.05528419050905439, "sampling/importance_sampling_ratio/max": 2.993243455886841, "sampling/importance_sampling_ratio/mean": 0.9490103125572205, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.937416076660156, "sampling/sampling_logp_difference/mean": 0.20558039844036102, "step": 49, "step_time": 167.97329160943627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 993.40625, "completions/mean_terminated_length": 944.1588134765625, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7639325112104416, "epoch": 0.12315270935960591, "frac_reward_zero_std": 0.25, "grad_norm": 0.0012238713070192111, "kl": 0.0012491706293076277, "learning_rate": 4.990893372472849e-05, "loss": 0.01330800261348486, "num_tokens": 7441626.0, "reward": 1.06640625, "reward_std": 0.2790367007255554, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.035972247935003705, "sampling/importance_sampling_ratio/max": 2.9993691444396973, "sampling/importance_sampling_ratio/mean": 0.9422820806503296, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.27017879486084, "sampling/sampling_logp_difference/mean": 0.2233036458492279, "step": 50, "step_time": 180.49298912403174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3854.0, "completions/mean_length": 1377.78125, "completions/mean_terminated_length": 1220.7626953125, "completions/min_length": 357.0, "completions/min_terminated_length": 357.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7340656220912933, "epoch": 0.12561576354679804, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016767950961260724, "kl": 0.002402618178166449, "learning_rate": 4.99047499705718e-05, "loss": -0.00485864607617259, "num_tokens": 7610044.0, "reward": 1.10546875, "reward_std": 0.3640727698802948, "rewards/reward_func/mean": 0.1228298611111111, "rewards/reward_func/std": 0.05129980875386132, "sampling/importance_sampling_ratio/max": 2.9986684322357178, "sampling/importance_sampling_ratio/mean": 0.9486904144287109, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.795515060424805, "sampling/sampling_logp_difference/mean": 0.2021857649087906, "step": 51, "step_time": 124.5531223397702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2975.0, "completions/mean_length": 1016.59375, "completions/mean_terminated_length": 865.1475219726562, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6252937316894531, "epoch": 0.12807881773399016, "frac_reward_zero_std": 0.5, "grad_norm": 0.00038234811897031116, "kl": 0.000800235866336152, "learning_rate": 4.990047244119383e-05, "loss": -0.010930902324616909, "num_tokens": 7758786.0, "reward": 1.125, "reward_std": 0.3700064420700073, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.049859102401468486, "sampling/importance_sampling_ratio/max": 2.9963314533233643, "sampling/importance_sampling_ratio/mean": 0.9571857452392578, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.874775886535645, "sampling/sampling_logp_difference/mean": 0.1737726330757141, "step": 52, "step_time": 173.18570705433376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3552.0, "completions/mean_length": 1157.84375, "completions/mean_terminated_length": 1111.2064208984375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6483455449342728, "epoch": 0.13054187192118227, "frac_reward_zero_std": 0.25, "grad_norm": 0.007614191151348716, "kl": 0.0013027136301388964, "learning_rate": 4.9896101152701e-05, "loss": 0.021573293954133987, "num_tokens": 7917912.0, "reward": 1.14453125, "reward_std": 0.6280555129051208, "rewards/reward_func/mean": 0.1271701388888889, "rewards/reward_func/std": 0.11123616165584987, "sampling/importance_sampling_ratio/max": 2.9958243370056152, "sampling/importance_sampling_ratio/mean": 0.955902099609375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.749975204467773, "sampling/sampling_logp_difference/mean": 0.18389251828193665, "step": 53, "step_time": 128.83531658397987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3641.0, "completions/mean_length": 1074.78125, "completions/mean_terminated_length": 977.3225708007812, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7957058399915695, "epoch": 0.1330049261083744, "frac_reward_zero_std": 0.0, "grad_norm": 0.0040000183115382806, "kl": 0.0013368913641897961, "learning_rate": 4.9891636121552745e-05, "loss": 0.026843538507819176, "num_tokens": 8075082.0, "reward": 1.0390625, "reward_std": 0.37059247493743896, "rewards/reward_func/mean": 0.1154513888888889, "rewards/reward_func/std": 0.055034501685036555, "sampling/importance_sampling_ratio/max": 2.994692325592041, "sampling/importance_sampling_ratio/mean": 0.9410925507545471, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.202977180480957, "sampling/sampling_logp_difference/mean": 0.2365683615207672, "step": 54, "step_time": 130.15844374126755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3922.0, "completions/mean_length": 1037.890625, "completions/mean_terminated_length": 939.2418823242188, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6936090290546417, "epoch": 0.1354679802955665, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026880063656234054, "kl": 0.0012836718233302236, "learning_rate": 4.988707736456151e-05, "loss": 0.020552635192871094, "num_tokens": 8223907.0, "reward": 1.01953125, "reward_std": 0.2897203266620636, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.045329956544770136, "sampling/importance_sampling_ratio/max": 2.9959287643432617, "sampling/importance_sampling_ratio/mean": 0.9536117315292358, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.170980453491211, "sampling/sampling_logp_difference/mean": 0.1910027265548706, "step": 55, "step_time": 131.79171376302838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2972.0, "completions/mean_length": 911.46875, "completions/mean_terminated_length": 808.7418823242188, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7065327614545822, "epoch": 0.13793103448275862, "frac_reward_zero_std": 0.5, "grad_norm": 0.0012146386659164553, "kl": 0.0009278918150812387, "learning_rate": 4.9882424898892635e-05, "loss": 0.0029757781885564327, "num_tokens": 8360993.0, "reward": 1.0625, "reward_std": 0.2920915186405182, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.04098213298453225, "sampling/importance_sampling_ratio/max": 2.9910311698913574, "sampling/importance_sampling_ratio/mean": 0.9575042724609375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.249921798706055, "sampling/sampling_logp_difference/mean": 0.19161823391914368, "step": 56, "step_time": 113.03869129787199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3818.0, "completions/mean_length": 1330.109375, "completions/mean_terminated_length": 1145.7166748046875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7262924760580063, "epoch": 0.14039408866995073, "frac_reward_zero_std": 1.0, "grad_norm": 9.350445461896138e-05, "kl": 0.0008153790549840778, "learning_rate": 4.987767874206428e-05, "loss": 1.2529770174296573e-05, "num_tokens": 8527992.0, "reward": 1.15625, "reward_std": 0.36596253514289856, "rewards/reward_func/mean": 0.1284722222222222, "rewards/reward_func/std": 0.04066250390476651, "sampling/importance_sampling_ratio/max": 2.998924493789673, "sampling/importance_sampling_ratio/mean": 0.9534813165664673, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.857730865478516, "sampling/sampling_logp_difference/mean": 0.19994306564331055, "step": 57, "step_time": 175.88998585077934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2244.0, "completions/mean_length": 745.0625, "completions/mean_terminated_length": 691.873046875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7556774914264679, "epoch": 0.14285714285714285, "frac_reward_zero_std": 0.0, "grad_norm": 0.06262985175890678, "kl": 0.1303897971083643, "learning_rate": 4.987283891194743e-05, "loss": -0.03529379516839981, "num_tokens": 8660748.0, "reward": 1.1171875, "reward_std": 0.47761133313179016, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.06965038345919715, "sampling/importance_sampling_ratio/max": 2.997659206390381, "sampling/importance_sampling_ratio/mean": 0.956047773361206, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.499494552612305, "sampling/sampling_logp_difference/mean": 0.19947615265846252, "step": 58, "step_time": 164.91047239373438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2787.0, "completions/mean_length": 877.609375, "completions/mean_terminated_length": 826.5238647460938, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6654725968837738, "epoch": 0.14532019704433496, "frac_reward_zero_std": 0.5, "grad_norm": 0.0013078254591733234, "kl": 0.0010809154191520065, "learning_rate": 4.986790542676576e-05, "loss": -0.005135550629347563, "num_tokens": 8806419.0, "reward": 1.046875, "reward_std": 0.2667968273162842, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.038036055862903595, "sampling/importance_sampling_ratio/max": 2.996380090713501, "sampling/importance_sampling_ratio/mean": 0.9533485174179077, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.1231632232666, "sampling/sampling_logp_difference/mean": 0.19166377186775208, "step": 59, "step_time": 118.16056217276491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1737.0, "completions/mean_length": 954.046875, "completions/mean_terminated_length": 792.5000610351562, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7431278079748154, "epoch": 0.1477832512315271, "frac_reward_zero_std": 0.0, "grad_norm": 0.0009579017109342759, "kl": 0.0010434478608658537, "learning_rate": 4.986287830509558e-05, "loss": -0.007221859414130449, "num_tokens": 8964854.0, "reward": 1.06640625, "reward_std": 0.3219551742076874, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.04488379342688455, "sampling/importance_sampling_ratio/max": 2.999706745147705, "sampling/importance_sampling_ratio/mean": 0.9473176598548889, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.110328674316406, "sampling/sampling_logp_difference/mean": 0.21286305785179138, "step": 60, "step_time": 138.2910026947502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 617.0, "completions/mean_terminated_length": 612.84130859375, "completions/min_length": 251.0, "completions/min_terminated_length": 251.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6504525691270828, "epoch": 0.15024630541871922, "frac_reward_zero_std": 0.0, "grad_norm": 0.0026108009768812056, "kl": 0.001221363214426674, "learning_rate": 4.985775756586581e-05, "loss": -0.018695060163736343, "num_tokens": 9094054.0, "reward": 1.00390625, "reward_std": 0.28692469000816345, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.04380870693259769, "sampling/importance_sampling_ratio/max": 2.9986069202423096, "sampling/importance_sampling_ratio/mean": 0.9576082229614258, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.987771987915039, "sampling/sampling_logp_difference/mean": 0.17746970057487488, "step": 61, "step_time": 64.66796927712858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3212.0, "completions/mean_length": 1179.359375, "completions/mean_terminated_length": 1085.274169921875, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6409039795398712, "epoch": 0.15270935960591134, "frac_reward_zero_std": 0.25, "grad_norm": 0.0034371950102880864, "kl": 0.0011749721161322668, "learning_rate": 4.9852543228357835e-05, "loss": 0.029548635706305504, "num_tokens": 9254781.0, "reward": 1.0, "reward_std": 0.29880714416503906, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.04573592378033532, "sampling/importance_sampling_ratio/max": 2.998082160949707, "sampling/importance_sampling_ratio/mean": 0.9520100355148315, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.498339653015137, "sampling/sampling_logp_difference/mean": 0.193486750125885, "step": 62, "step_time": 167.41590802790597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3880.0, "completions/mean_length": 791.21875, "completions/mean_terminated_length": 745.0967407226562, "completions/min_length": 178.0, "completions/min_terminated_length": 178.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7853821069002151, "epoch": 0.15517241379310345, "frac_reward_zero_std": 0.25, "grad_norm": 0.004548122568676759, "kl": 0.0013982994423713535, "learning_rate": 4.9847235312205484e-05, "loss": -0.033901821821928024, "num_tokens": 9385451.0, "reward": 0.93359375, "reward_std": 0.38638070225715637, "rewards/reward_func/mean": 0.1037326388888889, "rewards/reward_func/std": 0.05757651891973284, "sampling/importance_sampling_ratio/max": 2.99320650100708, "sampling/importance_sampling_ratio/mean": 0.9517427682876587, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.464980125427246, "sampling/sampling_logp_difference/mean": 0.20454317331314087, "step": 63, "step_time": 117.68562039383687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3050.0, "completions/mean_length": 1021.453125, "completions/mean_terminated_length": 921.9671630859375, "completions/min_length": 289.0, "completions/min_terminated_length": 289.0, "degenerate_groups_filtered": 1.0, "entropy": 0.5929539352655411, "epoch": 0.15763546798029557, "frac_reward_zero_std": 0.5, "grad_norm": 0.0027889445903394447, "kl": 0.0008723176724743098, "learning_rate": 4.984183383739496e-05, "loss": -0.021047594025731087, "num_tokens": 9532632.0, "reward": 1.01171875, "reward_std": 0.36847415566444397, "rewards/reward_func/mean": 0.11241319444444445, "rewards/reward_func/std": 0.05727195077472263, "sampling/importance_sampling_ratio/max": 2.9981136322021484, "sampling/importance_sampling_ratio/mean": 0.9593861699104309, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.235153198242188, "sampling/sampling_logp_difference/mean": 0.16657081246376038, "step": 64, "step_time": 131.22120441007428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3649.0, "completions/mean_length": 1245.90625, "completions/mean_terminated_length": 1168.475341796875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6179927885532379, "epoch": 0.16009852216748768, "frac_reward_zero_std": 0.0, "grad_norm": 0.004497392045849199, "kl": 0.0008117440593196079, "learning_rate": 4.983633882426471e-05, "loss": -0.004598885774612427, "num_tokens": 9694866.0, "reward": 0.9609375, "reward_std": 0.4569106698036194, "rewards/reward_func/mean": 0.10677083333333333, "rewards/reward_func/std": 0.06866345471805996, "sampling/importance_sampling_ratio/max": 2.9998435974121094, "sampling/importance_sampling_ratio/mean": 0.9481761455535889, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.188133239746094, "sampling/sampling_logp_difference/mean": 0.19589340686798096, "step": 65, "step_time": 133.8538687042892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3100.0, "completions/mean_length": 750.578125, "completions/mean_terminated_length": 685.9835815429688, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7141001224517822, "epoch": 0.1625615763546798, "frac_reward_zero_std": 0.0, "grad_norm": 0.006127008410444587, "kl": 0.001061671442585066, "learning_rate": 4.983075029350542e-05, "loss": -0.015230651013553143, "num_tokens": 9823479.0, "reward": 1.05859375, "reward_std": 0.5820431113243103, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.12548058893945482, "sampling/importance_sampling_ratio/max": 2.998790740966797, "sampling/importance_sampling_ratio/mean": 0.9558759331703186, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.232525825500488, "sampling/sampling_logp_difference/mean": 0.18798446655273438, "step": 66, "step_time": 142.64381241961382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1055.546875, "completions/mean_terminated_length": 959.5409545898438, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5656583607196808, "epoch": 0.16502463054187191, "frac_reward_zero_std": 0.0, "grad_norm": 0.006161927067566899, "kl": 0.0006807406462030485, "learning_rate": 4.9825068266159894e-05, "loss": -0.014742434024810791, "num_tokens": 9974714.0, "reward": 1.0, "reward_std": 0.6666666865348816, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.09259259700775146, "sampling/importance_sampling_ratio/max": 2.999696969985962, "sampling/importance_sampling_ratio/mean": 0.9569911956787109, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.746885299682617, "sampling/sampling_logp_difference/mean": 0.17171627283096313, "step": 67, "step_time": 173.7841714611277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4043.0, "completions/max_terminated_length": 4043.0, "completions/mean_length": 888.375, "completions/mean_terminated_length": 895.730224609375, "completions/min_length": 85.0, "completions/min_terminated_length": 85.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6102766692638397, "epoch": 0.16748768472906403, "frac_reward_zero_std": 0.0, "grad_norm": 0.007500581448964243, "kl": 0.0015853389049880207, "learning_rate": 4.981929276362298e-05, "loss": 0.08089423179626465, "num_tokens": 10117634.0, "reward": 0.90625, "reward_std": 0.8643053770065308, "rewards/reward_func/mean": 0.10069444444444445, "rewards/reward_func/std": 0.12619537777370876, "sampling/importance_sampling_ratio/max": 2.989180564880371, "sampling/importance_sampling_ratio/mean": 0.9547425508499146, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.812376976013184, "sampling/sampling_logp_difference/mean": 0.17781385779380798, "step": 68, "step_time": 125.53745425422676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3739.0, "completions/mean_length": 1150.171875, "completions/mean_terminated_length": 1053.2930908203125, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6405633389949799, "epoch": 0.16995073891625614, "frac_reward_zero_std": 0.0, "grad_norm": 0.005291742203731279, "kl": 0.0008613676036475226, "learning_rate": 4.981342380764149e-05, "loss": -0.03940670192241669, "num_tokens": 10279517.0, "reward": 0.88671875, "reward_std": 0.7584571838378906, "rewards/reward_func/mean": 0.09852430555555555, "rewards/reward_func/std": 0.1488193174203237, "sampling/importance_sampling_ratio/max": 2.999359369277954, "sampling/importance_sampling_ratio/mean": 0.9512237310409546, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.12166976928711, "sampling/sampling_logp_difference/mean": 0.18591460585594177, "step": 69, "step_time": 149.70758228283376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3832.0, "completions/mean_length": 879.140625, "completions/mean_terminated_length": 661.4827270507812, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8025156259536743, "epoch": 0.1724137931034483, "frac_reward_zero_std": 0.0, "grad_norm": 0.00684584761013416, "kl": 0.001387007927405648, "learning_rate": 4.980746142031414e-05, "loss": 0.01730230078101158, "num_tokens": 10415078.0, "reward": 0.91015625, "reward_std": 0.6977389454841614, "rewards/reward_func/mean": 0.10112847222222222, "rewards/reward_func/std": 0.13860311441951328, "sampling/importance_sampling_ratio/max": 2.9970333576202393, "sampling/importance_sampling_ratio/mean": 0.9527570009231567, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.19293212890625, "sampling/sampling_logp_difference/mean": 0.20155400037765503, "step": 70, "step_time": 183.71196515997872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3417.0, "completions/mean_length": 1053.296875, "completions/mean_terminated_length": 849.9649047851562, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6875295341014862, "epoch": 0.1748768472906404, "frac_reward_zero_std": 0.25, "grad_norm": 0.003532877881341452, "kl": 0.0006982934210100211, "learning_rate": 4.980140562409141e-05, "loss": -0.023352203890681267, "num_tokens": 10571929.0, "reward": 0.94140625, "reward_std": 0.5487037301063538, "rewards/reward_func/mean": 0.10460069444444445, "rewards/reward_func/std": 0.07910366521941291, "sampling/importance_sampling_ratio/max": 2.9906861782073975, "sampling/importance_sampling_ratio/mean": 0.9550516605377197, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.016915321350098, "sampling/sampling_logp_difference/mean": 0.18335659801959991, "step": 71, "step_time": 124.68412435986102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2762.0, "completions/mean_length": 913.6875, "completions/mean_terminated_length": 787.57373046875, "completions/min_length": 64.0, "completions/min_terminated_length": 64.0, "degenerate_groups_filtered": 0.0, "entropy": 0.752479761838913, "epoch": 0.17733990147783252, "frac_reward_zero_std": 0.0, "grad_norm": 0.005311994696832263, "kl": 0.0015363168495241553, "learning_rate": 4.979525644177554e-05, "loss": 0.015766549855470657, "num_tokens": 10717333.0, "reward": 0.91796875, "reward_std": 0.5001084804534912, "rewards/reward_func/mean": 0.10199652777777778, "rewards/reward_func/std": 0.0739565756585863, "sampling/importance_sampling_ratio/max": 2.9991776943206787, "sampling/importance_sampling_ratio/mean": 0.955678403377533, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.945234298706055, "sampling/sampling_logp_difference/mean": 0.19813895225524902, "step": 72, "step_time": 143.27259426680394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3845.0, "completions/mean_length": 1054.390625, "completions/mean_terminated_length": 957.360595703125, "completions/min_length": 14.0, "completions/min_terminated_length": 14.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6393098831176758, "epoch": 0.17980295566502463, "frac_reward_zero_std": 0.25, "grad_norm": 0.0011923495379162847, "kl": 0.0012014579260721803, "learning_rate": 4.978901389652039e-05, "loss": -0.019676342606544495, "num_tokens": 10870078.0, "reward": 1.07421875, "reward_std": 0.41230979561805725, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.06086550156275431, "sampling/importance_sampling_ratio/max": 2.9990031719207764, "sampling/importance_sampling_ratio/mean": 0.9514544010162354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.248948097229004, "sampling/sampling_logp_difference/mean": 0.19227707386016846, "step": 73, "step_time": 188.85755535191856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4092.0, "completions/max_terminated_length": 4092.0, "completions/mean_length": 838.75, "completions/mean_terminated_length": 840.5573120117188, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "degenerate_groups_filtered": 0.0, "entropy": 0.653815358877182, "epoch": 0.18226600985221675, "frac_reward_zero_std": 0.0, "grad_norm": 0.009966630550985659, "kl": 0.0005712288402719423, "learning_rate": 4.978267801183133e-05, "loss": -0.03280792012810707, "num_tokens": 11001822.0, "reward": 1.0625, "reward_std": 0.7278474569320679, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.1253587090306812, "sampling/importance_sampling_ratio/max": 2.995110273361206, "sampling/importance_sampling_ratio/mean": 0.9579760432243347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.561847686767578, "sampling/sampling_logp_difference/mean": 0.16925372183322906, "step": 74, "step_time": 126.91733171069063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3472.0, "completions/mean_length": 1195.765625, "completions/mean_terminated_length": 1053.131103515625, "completions/min_length": 360.0, "completions/min_terminated_length": 360.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7155862152576447, "epoch": 0.18472906403940886, "frac_reward_zero_std": 0.5, "grad_norm": 0.0036799305426663165, "kl": 0.001092532867914997, "learning_rate": 4.977624881156524e-05, "loss": 0.02602587826550007, "num_tokens": 11167903.0, "reward": 0.98046875, "reward_std": 0.378763347864151, "rewards/reward_func/mean": 0.10894097222222222, "rewards/reward_func/std": 0.05723588830894894, "sampling/importance_sampling_ratio/max": 2.997964382171631, "sampling/importance_sampling_ratio/mean": 0.9472610950469971, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.248061180114746, "sampling/sampling_logp_difference/mean": 0.20667724311351776, "step": 75, "step_time": 129.2886537532322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2813.0, "completions/mean_length": 1064.53125, "completions/mean_terminated_length": 966.7418823242188, "completions/min_length": 4.0, "completions/min_terminated_length": 4.0, "degenerate_groups_filtered": 1.0, "entropy": 0.721578374505043, "epoch": 0.18719211822660098, "frac_reward_zero_std": 0.25, "grad_norm": 0.004605488257337456, "kl": 0.002601891625090502, "learning_rate": 4.976972631993033e-05, "loss": -0.03658334165811539, "num_tokens": 11324945.0, "reward": 0.921875, "reward_std": 0.4753340482711792, "rewards/reward_func/mean": 0.10243055555555555, "rewards/reward_func/std": 0.06980303592152065, "sampling/importance_sampling_ratio/max": 2.995720148086548, "sampling/importance_sampling_ratio/mean": 0.9485733509063721, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.749515533447266, "sampling/sampling_logp_difference/mean": 0.21007245779037476, "step": 76, "step_time": 135.25108521990478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3425.0, "completions/mean_length": 1197.34375, "completions/mean_terminated_length": 1135.91796875, "completions/min_length": 393.0, "completions/min_terminated_length": 393.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6039802730083466, "epoch": 0.1896551724137931, "frac_reward_zero_std": 0.5, "grad_norm": 0.004743221131144578, "kl": 0.0006622753426199779, "learning_rate": 4.976311056148609e-05, "loss": 0.03427097201347351, "num_tokens": 11489447.0, "reward": 1.1171875, "reward_std": 0.7042511701583862, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.10813031593958537, "sampling/importance_sampling_ratio/max": 2.9994306564331055, "sampling/importance_sampling_ratio/mean": 0.9550964832305908, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.74364471435547, "sampling/sampling_logp_difference/mean": 0.17511072754859924, "step": 77, "step_time": 129.62398432497866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3571.0, "completions/mean_length": 1217.953125, "completions/mean_terminated_length": 1170.725830078125, "completions/min_length": 16.0, "completions/min_terminated_length": 16.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5881828814744949, "epoch": 0.1921182266009852, "frac_reward_zero_std": 0.25, "grad_norm": 0.004901360594166145, "kl": 0.0007936922629596666, "learning_rate": 4.975640156114322e-05, "loss": -0.009430614300072193, "num_tokens": 11659988.0, "reward": 1.0546875, "reward_std": 0.6163589358329773, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.10998319089412689, "sampling/importance_sampling_ratio/max": 2.995478391647339, "sampling/importance_sampling_ratio/mean": 0.9541982412338257, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.298104286193848, "sampling/sampling_logp_difference/mean": 0.177594393491745, "step": 78, "step_time": 132.9943831146229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2884.0, "completions/max_terminated_length": 2884.0, "completions/mean_length": 1001.3125, "completions/mean_terminated_length": 1001.3125, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6656395643949509, "epoch": 0.19458128078817735, "frac_reward_zero_std": 0.25, "grad_norm": 0.006107805006785518, "kl": 0.0009958412119885907, "learning_rate": 4.974959934416346e-05, "loss": 0.006846698001027107, "num_tokens": 11805816.0, "reward": 1.23828125, "reward_std": 0.5426816344261169, "rewards/reward_func/mean": 0.13758680555555555, "rewards/reward_func/std": 0.09264064538809988, "sampling/importance_sampling_ratio/max": 2.9980006217956543, "sampling/importance_sampling_ratio/mean": 0.9504839181900024, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.141356468200684, "sampling/sampling_logp_difference/mean": 0.1968192458152771, "step": 79, "step_time": 88.39881527097896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3718.0, "completions/mean_length": 962.09375, "completions/mean_terminated_length": 929.6500244140625, "completions/min_length": 34.0, "completions/min_terminated_length": 258.0, "degenerate_groups_filtered": 0.0, "entropy": 0.612749919295311, "epoch": 0.19704433497536947, "frac_reward_zero_std": 0.25, "grad_norm": 0.009568223063933647, "kl": 0.00103502553247381, "learning_rate": 4.9742703936159586e-05, "loss": -0.10045486688613892, "num_tokens": 11939118.0, "reward": 1.125, "reward_std": 0.6251983642578125, "rewards/reward_func/mean": 0.125, "rewards/reward_func/std": 0.11257308059268528, "sampling/importance_sampling_ratio/max": 2.9995410442352295, "sampling/importance_sampling_ratio/mean": 0.9614801406860352, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.320719718933105, "sampling/sampling_logp_difference/mean": 0.168918639421463, "step": 80, "step_time": 138.59337026928551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2942.0, "completions/max_terminated_length": 2942.0, "completions/mean_length": 777.546875, "completions/mean_terminated_length": 777.546875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "degenerate_groups_filtered": 0.0, "entropy": 0.716793105006218, "epoch": 0.19950738916256158, "frac_reward_zero_std": 0.25, "grad_norm": 0.004065946912247252, "kl": 0.0015465420437976718, "learning_rate": 4.973571536309525e-05, "loss": 0.03413772210478783, "num_tokens": 12091009.0, "reward": 1.04296875, "reward_std": 0.40959399938583374, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.060785247219933405, "sampling/importance_sampling_ratio/max": 2.9997410774230957, "sampling/importance_sampling_ratio/mean": 0.9524143934249878, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.888782501220703, "sampling/sampling_logp_difference/mean": 0.20447459816932678, "step": 81, "step_time": 88.59409447899088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3562.0, "completions/mean_length": 1248.171875, "completions/mean_terminated_length": 1137.1802978515625, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "degenerate_groups_filtered": 0.0, "entropy": 0.71332086622715, "epoch": 0.2019704433497537, "frac_reward_zero_std": 0.0, "grad_norm": 0.00811562576764538, "kl": 0.001496812139521353, "learning_rate": 4.9728633651284914e-05, "loss": 0.02764129638671875, "num_tokens": 12264156.0, "reward": 1.22265625, "reward_std": 0.8730047345161438, "rewards/reward_func/mean": 0.13585069444444445, "rewards/reward_func/std": 0.14530256390571594, "sampling/importance_sampling_ratio/max": 2.997504711151123, "sampling/importance_sampling_ratio/mean": 0.9448159337043762, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.87446403503418, "sampling/sampling_logp_difference/mean": 0.21683049201965332, "step": 82, "step_time": 166.05172082805075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3426.0, "completions/max_terminated_length": 3426.0, "completions/mean_length": 822.71875, "completions/mean_terminated_length": 822.71875, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6466097682714462, "epoch": 0.2044334975369458, "frac_reward_zero_std": 0.5, "grad_norm": 0.002471536066340347, "kl": 0.0014289210666902363, "learning_rate": 4.972145882739374e-05, "loss": -0.0035694753751158714, "num_tokens": 12403786.0, "reward": 1.00390625, "reward_std": 0.23776483535766602, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.0367136730088128, "sampling/importance_sampling_ratio/max": 2.9996731281280518, "sampling/importance_sampling_ratio/mean": 0.9583299160003662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.391860961914062, "sampling/sampling_logp_difference/mean": 0.17760473489761353, "step": 83, "step_time": 102.87069191993214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3689.0, "completions/mean_length": 1022.359375, "completions/mean_terminated_length": 973.5714721679688, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7003387361764908, "epoch": 0.20689655172413793, "frac_reward_zero_std": 0.75, "grad_norm": 0.0011806928817891115, "kl": 0.0011969898623647168, "learning_rate": 4.971419091843748e-05, "loss": 0.006968214176595211, "num_tokens": 12547905.0, "reward": 1.06640625, "reward_std": 0.2895062267780304, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.04047108110454348, "sampling/importance_sampling_ratio/max": 2.9965357780456543, "sampling/importance_sampling_ratio/mean": 0.9557406902313232, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.999181747436523, "sampling/sampling_logp_difference/mean": 0.18569059669971466, "step": 84, "step_time": 134.5772271042224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2939.0, "completions/max_terminated_length": 2939.0, "completions/mean_length": 605.40625, "completions/mean_terminated_length": 605.40625, "completions/min_length": 66.0, "completions/min_terminated_length": 66.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7465554922819138, "epoch": 0.20935960591133004, "frac_reward_zero_std": 0.25, "grad_norm": 0.004734667031571098, "kl": 0.004191852669464424, "learning_rate": 4.970682995178238e-05, "loss": 0.019574182108044624, "num_tokens": 12666683.0, "reward": 1.0234375, "reward_std": 0.3583437204360962, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.05403099126285977, "sampling/importance_sampling_ratio/max": 2.9961087703704834, "sampling/importance_sampling_ratio/mean": 0.959001362323761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.754371643066406, "sampling/sampling_logp_difference/mean": 0.18795375525951385, "step": 85, "step_time": 83.09169630077668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3817.0, "completions/max_terminated_length": 3817.0, "completions/mean_length": 956.046875, "completions/mean_terminated_length": 923.4127807617188, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7266160845756531, "epoch": 0.21182266009852216, "frac_reward_zero_std": 0.25, "grad_norm": 0.0032079723243338284, "kl": 0.0021485694451257586, "learning_rate": 4.9699375955145114e-05, "loss": 0.008961433544754982, "num_tokens": 12820238.0, "reward": 1.03515625, "reward_std": 0.40548616647720337, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.0595403081840939, "sampling/importance_sampling_ratio/max": 2.998619794845581, "sampling/importance_sampling_ratio/mean": 0.9531738758087158, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.032673835754395, "sampling/sampling_logp_difference/mean": 0.19796302914619446, "step": 86, "step_time": 137.24125836323947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 514.96875, "completions/mean_terminated_length": 514.96875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7525362968444824, "epoch": 0.21428571428571427, "frac_reward_zero_std": 0.25, "grad_norm": 0.005333796839874563, "kl": 0.0048426192661281675, "learning_rate": 4.96918289565926e-05, "loss": 0.01707097887992859, "num_tokens": 12935980.0, "reward": 0.96875, "reward_std": 0.2869516909122467, "rewards/reward_func/mean": 0.1076388888888889, "rewards/reward_func/std": 0.04373177720440759, "sampling/importance_sampling_ratio/max": 2.9986844062805176, "sampling/importance_sampling_ratio/mean": 0.9631032943725586, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.843316078186035, "sampling/sampling_logp_difference/mean": 0.18259428441524506, "step": 87, "step_time": 66.68644723505713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3148.0, "completions/mean_length": 958.21875, "completions/mean_terminated_length": 857.0, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7587482780218124, "epoch": 0.21674876847290642, "frac_reward_zero_std": 0.75, "grad_norm": 0.0011093462895115053, "kl": 0.0011742105707526207, "learning_rate": 4.968418898454199e-05, "loss": -0.019757457077503204, "num_tokens": 13079658.0, "reward": 1.203125, "reward_std": 0.47114139795303345, "rewards/reward_func/mean": 0.13368055555555555, "rewards/reward_func/std": 0.0649379442135493, "sampling/importance_sampling_ratio/max": 2.9928460121154785, "sampling/importance_sampling_ratio/mean": 0.951276421546936, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.812447547912598, "sampling/sampling_logp_difference/mean": 0.2016531229019165, "step": 88, "step_time": 175.04604559391737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3305.0, "completions/mean_length": 1278.84375, "completions/mean_terminated_length": 1195.0982666015625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6959878653287888, "epoch": 0.21921182266009853, "frac_reward_zero_std": 0.75, "grad_norm": 0.00018656617621445748, "kl": 0.00109332193096634, "learning_rate": 4.967645606776047e-05, "loss": -0.002903047716245055, "num_tokens": 13240544.0, "reward": 1.12109375, "reward_std": 0.33627331256866455, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.04050926036304898, "sampling/importance_sampling_ratio/max": 2.995325803756714, "sampling/importance_sampling_ratio/mean": 0.9519744515419006, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.622486114501953, "sampling/sampling_logp_difference/mean": 0.199794203042984, "step": 89, "step_time": 172.4871400659904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3005.0, "completions/mean_length": 948.015625, "completions/mean_terminated_length": 832.91796875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7445888072252274, "epoch": 0.22167487684729065, "frac_reward_zero_std": 0.5, "grad_norm": 0.002568082327493114, "kl": 0.0015613814030075446, "learning_rate": 4.966863023536523e-05, "loss": -0.0222585778683424, "num_tokens": 13385345.0, "reward": 1.05859375, "reward_std": 0.34429070353507996, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.050396261943711176, "sampling/importance_sampling_ratio/max": 2.9980051517486572, "sampling/importance_sampling_ratio/mean": 0.95062255859375, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.059782028198242, "sampling/sampling_logp_difference/mean": 0.20940154790878296, "step": 90, "step_time": 131.1308980169706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3719.0, "completions/mean_length": 1014.3125, "completions/mean_terminated_length": 924.6557006835938, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7877677083015442, "epoch": 0.22413793103448276, "frac_reward_zero_std": 0.25, "grad_norm": 0.0007955641288236894, "kl": 0.001454152661608532, "learning_rate": 4.96607115168233e-05, "loss": -0.02363387495279312, "num_tokens": 13535109.0, "reward": 1.0703125, "reward_std": 0.32874444127082825, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.0472567660941018, "sampling/importance_sampling_ratio/max": 2.998769998550415, "sampling/importance_sampling_ratio/mean": 0.9478331804275513, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.211028099060059, "sampling/sampling_logp_difference/mean": 0.21998655796051025, "step": 91, "step_time": 114.45941002899781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3213.0, "completions/mean_length": 892.671875, "completions/mean_terminated_length": 841.825439453125, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6217218488454819, "epoch": 0.22660098522167488, "frac_reward_zero_std": 0.0, "grad_norm": 0.001966924586813133, "kl": 0.001591385604115203, "learning_rate": 4.965269994195146e-05, "loss": -0.009086966514587402, "num_tokens": 13672560.0, "reward": 1.09375, "reward_std": 0.41187721490859985, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.05926263497935401, "sampling/importance_sampling_ratio/max": 2.9972190856933594, "sampling/importance_sampling_ratio/mean": 0.9610211849212646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.113224983215332, "sampling/sampling_logp_difference/mean": 0.16949692368507385, "step": 92, "step_time": 160.50747915613465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3949.0, "completions/mean_length": 1133.765625, "completions/mean_terminated_length": 1086.74609375, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6935625523328781, "epoch": 0.229064039408867, "frac_reward_zero_std": 0.5, "grad_norm": 0.0010926697027660048, "kl": 0.0019639305828604847, "learning_rate": 4.964459554091615e-05, "loss": 0.007769447285681963, "num_tokens": 13826593.0, "reward": 1.08203125, "reward_std": 0.3022885024547577, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.03856059287985166, "sampling/importance_sampling_ratio/max": 2.9993722438812256, "sampling/importance_sampling_ratio/mean": 0.9566208720207214, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.104926109313965, "sampling/sampling_logp_difference/mean": 0.18871435523033142, "step": 93, "step_time": 126.3685281840153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2425.0, "completions/max_terminated_length": 2425.0, "completions/mean_length": 994.125, "completions/mean_terminated_length": 994.125, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6710119396448135, "epoch": 0.2315270935960591, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010828470047238532, "kl": 0.0010478518815943971, "learning_rate": 4.9636398344233294e-05, "loss": 2.0540661353152245e-05, "num_tokens": 13978137.0, "reward": 1.078125, "reward_std": 0.2704896926879883, "rewards/reward_func/mean": 0.11979166666666667, "rewards/reward_func/std": 0.030054413610034518, "sampling/importance_sampling_ratio/max": 2.997786521911621, "sampling/importance_sampling_ratio/mean": 0.9555579423904419, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.089488983154297, "sampling/sampling_logp_difference/mean": 0.18803386390209198, "step": 94, "step_time": 78.18217662116513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3947.0, "completions/mean_length": 1246.109375, "completions/mean_terminated_length": 1088.4482421875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6725437641143799, "epoch": 0.23399014778325122, "frac_reward_zero_std": 0.25, "grad_norm": 0.002324607148485349, "kl": 0.0010944021196337417, "learning_rate": 4.9628108382768255e-05, "loss": 0.025269202888011932, "num_tokens": 14144960.0, "reward": 1.140625, "reward_std": 0.3905505836009979, "rewards/reward_func/mean": 0.1267361111111111, "rewards/reward_func/std": 0.058287974860933095, "sampling/importance_sampling_ratio/max": 2.9950029850006104, "sampling/importance_sampling_ratio/mean": 0.9547150135040283, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.426436424255371, "sampling/sampling_logp_difference/mean": 0.1817963868379593, "step": 95, "step_time": 214.41529780323617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3043.0, "completions/mean_length": 975.09375, "completions/mean_terminated_length": 821.6065063476562, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7443526685237885, "epoch": 0.23645320197044334, "frac_reward_zero_std": 0.25, "grad_norm": 0.0023251836424105837, "kl": 0.0014684945635963231, "learning_rate": 4.9619725687735686e-05, "loss": -0.007826524786651134, "num_tokens": 14304006.0, "reward": 1.0859375, "reward_std": 0.36247265338897705, "rewards/reward_func/mean": 0.12065972222222222, "rewards/reward_func/std": 0.05503144032425351, "sampling/importance_sampling_ratio/max": 2.998788595199585, "sampling/importance_sampling_ratio/mean": 0.9454492926597595, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.376626014709473, "sampling/sampling_logp_difference/mean": 0.2146020233631134, "step": 96, "step_time": 132.66262619709596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3153.0, "completions/mean_length": 769.90625, "completions/mean_terminated_length": 717.1111450195312, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8366009593009949, "epoch": 0.23891625615763548, "frac_reward_zero_std": 0.25, "grad_norm": 0.002077514765880351, "kl": 0.0015410964551847428, "learning_rate": 4.96112502906994e-05, "loss": -0.012058844789862633, "num_tokens": 14438768.0, "reward": 1.09765625, "reward_std": 0.3716575503349304, "rewards/reward_func/mean": 0.12196180555555555, "rewards/reward_func/std": 0.05277948081493378, "sampling/importance_sampling_ratio/max": 2.99769926071167, "sampling/importance_sampling_ratio/mean": 0.9524117708206177, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.555198669433594, "sampling/sampling_logp_difference/mean": 0.212304025888443, "step": 97, "step_time": 117.63319608126767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2758.0, "completions/max_terminated_length": 2758.0, "completions/mean_length": 769.34375, "completions/mean_terminated_length": 778.2698974609375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6486907005310059, "epoch": 0.2413793103448276, "frac_reward_zero_std": 0.5, "grad_norm": 0.01006651140897198, "kl": 0.0011220714950468391, "learning_rate": 4.960268222357227e-05, "loss": -0.02731485851109028, "num_tokens": 14588486.0, "reward": 1.17578125, "reward_std": 0.5130822062492371, "rewards/reward_func/mean": 0.1306423611111111, "rewards/reward_func/std": 0.09101471718814638, "sampling/importance_sampling_ratio/max": 2.9995951652526855, "sampling/importance_sampling_ratio/mean": 0.9494529366493225, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.874653816223145, "sampling/sampling_logp_difference/mean": 0.20342113077640533, "step": 98, "step_time": 86.39957803068683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4018.0, "completions/mean_length": 1201.5625, "completions/mean_terminated_length": 1148.806396484375, "completions/min_length": 324.0, "completions/min_terminated_length": 324.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7736449986696243, "epoch": 0.2438423645320197, "frac_reward_zero_std": 0.5, "grad_norm": 0.000667562733610488, "kl": 0.001313261323957704, "learning_rate": 4.959402151861613e-05, "loss": -0.014927120879292488, "num_tokens": 14754538.0, "reward": 1.15234375, "reward_std": 0.40974533557891846, "rewards/reward_func/mean": 0.12803819444444445, "rewards/reward_func/std": 0.05651323828432295, "sampling/importance_sampling_ratio/max": 2.9992804527282715, "sampling/importance_sampling_ratio/mean": 0.9442236423492432, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.86728286743164, "sampling/sampling_logp_difference/mean": 0.22248251736164093, "step": 99, "step_time": 155.09293641196564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3318.0, "completions/max_terminated_length": 3318.0, "completions/mean_length": 1113.453125, "completions/mean_terminated_length": 1121.0318603515625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6962731033563614, "epoch": 0.24630541871921183, "frac_reward_zero_std": 0.5, "grad_norm": 0.0009398742824687371, "kl": 0.0014017132634762675, "learning_rate": 4.958526820844158e-05, "loss": 0.009164243005216122, "num_tokens": 14919655.0, "reward": 1.23828125, "reward_std": 0.44639137387275696, "rewards/reward_func/mean": 0.13758680555555555, "rewards/reward_func/std": 0.054410699754953384, "sampling/importance_sampling_ratio/max": 2.998028039932251, "sampling/importance_sampling_ratio/mean": 0.950190544128418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.881841659545898, "sampling/sampling_logp_difference/mean": 0.19850248098373413, "step": 100, "step_time": 105.25422466010787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 2808.0, "completions/max_terminated_length": 2808.0, "completions/mean_length": 960.515625, "completions/mean_terminated_length": 950.0819091796875, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "degenerate_groups_filtered": 0.0, "entropy": 0.723781481385231, "epoch": 0.24876847290640394, "frac_reward_zero_std": 0.25, "grad_norm": 0.002647153784592124, "kl": 0.001565072947414592, "learning_rate": 4.957642232600797e-05, "loss": 0.021879084408283234, "num_tokens": 15068664.0, "reward": 1.0, "reward_std": 0.24800793826580048, "rewards/reward_func/mean": 0.1111111111111111, "rewards/reward_func/std": 0.04210699929131402, "sampling/importance_sampling_ratio/max": 2.9967308044433594, "sampling/importance_sampling_ratio/mean": 0.9538641571998596, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.660208702087402, "sampling/sampling_logp_difference/mean": 0.19491901993751526, "step": 101, "step_time": 92.62138540088199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2175.0, "completions/max_terminated_length": 2175.0, "completions/mean_length": 833.9375, "completions/mean_terminated_length": 827.9683227539062, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6343219578266144, "epoch": 0.2512315270935961, "frac_reward_zero_std": 0.75, "grad_norm": 0.0007505687183929501, "kl": 0.0012052875244989991, "learning_rate": 4.956748390462316e-05, "loss": 0.00302310474216938, "num_tokens": 15217684.0, "reward": 1.08984375, "reward_std": 0.2966987192630768, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.0361149807771047, "sampling/importance_sampling_ratio/max": 2.995220184326172, "sampling/importance_sampling_ratio/mean": 0.9567204713821411, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.999521255493164, "sampling/sampling_logp_difference/mean": 0.187168151140213, "step": 102, "step_time": 94.69526713481173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3928.0, "completions/mean_length": 1041.28125, "completions/mean_terminated_length": 887.9166870117188, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7419612556695938, "epoch": 0.2536945812807882, "frac_reward_zero_std": 0.25, "grad_norm": 0.002342374375304448, "kl": 0.0018976301944348961, "learning_rate": 4.955845297794348e-05, "loss": -0.013131720013916492, "num_tokens": 15373478.0, "reward": 1.03125, "reward_std": 0.3288387358188629, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.048490075601471797, "sampling/importance_sampling_ratio/max": 2.998152256011963, "sampling/importance_sampling_ratio/mean": 0.9477394819259644, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.104028701782227, "sampling/sampling_logp_difference/mean": 0.20725718140602112, "step": 103, "step_time": 205.12117066117935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3203.0, "completions/mean_length": 1104.546875, "completions/mean_terminated_length": 1008.04833984375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6666458696126938, "epoch": 0.2561576354679803, "frac_reward_zero_std": 0.0, "grad_norm": 0.001985874134046432, "kl": 0.001993619982386008, "learning_rate": 4.954932957997359e-05, "loss": -0.012176139280200005, "num_tokens": 15524409.0, "reward": 1.05859375, "reward_std": 0.3799075484275818, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.05546691517035166, "sampling/importance_sampling_ratio/max": 2.996752977371216, "sampling/importance_sampling_ratio/mean": 0.9528929591178894, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.815652847290039, "sampling/sampling_logp_difference/mean": 0.192731574177742, "step": 104, "step_time": 122.30552988126874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3829.0, "completions/mean_length": 894.734375, "completions/mean_terminated_length": 843.920654296875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7775083035230637, "epoch": 0.25862068965517243, "frac_reward_zero_std": 0.25, "grad_norm": 0.0032020832600996134, "kl": 0.001513187715318054, "learning_rate": 4.954011374506632e-05, "loss": 0.0022926977835595608, "num_tokens": 15670424.0, "reward": 1.0703125, "reward_std": 0.35486623644828796, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.052947340740097895, "sampling/importance_sampling_ratio/max": 2.9996097087860107, "sampling/importance_sampling_ratio/mean": 0.9513598680496216, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.919264793395996, "sampling/sampling_logp_difference/mean": 0.21757398545742035, "step": 105, "step_time": 133.78325367206708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3971.0, "completions/mean_length": 1172.4375, "completions/mean_terminated_length": 1028.6556396484375, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7460175901651382, "epoch": 0.26108374384236455, "frac_reward_zero_std": 0.5, "grad_norm": 0.0018893770374146888, "kl": 0.0018231416470371187, "learning_rate": 4.953080550792254e-05, "loss": -0.010393545962870121, "num_tokens": 15823796.0, "reward": 1.05859375, "reward_std": 0.3952651023864746, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.0587814019785987, "sampling/importance_sampling_ratio/max": 2.9988386631011963, "sampling/importance_sampling_ratio/mean": 0.9498333930969238, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.434999465942383, "sampling/sampling_logp_difference/mean": 0.205765038728714, "step": 106, "step_time": 131.6286746961996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3139.0, "completions/max_terminated_length": 3139.0, "completions/mean_length": 883.109375, "completions/mean_terminated_length": 883.109375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "degenerate_groups_filtered": 1.0, "entropy": 0.735322967171669, "epoch": 0.26354679802955666, "frac_reward_zero_std": 0.75, "grad_norm": 0.00035116229639673005, "kl": 0.0015636045136488974, "learning_rate": 4.952140490359108e-05, "loss": 0.00011009792069671676, "num_tokens": 15962427.0, "reward": 1.18359375, "reward_std": 0.3965180516242981, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.047183099720213145, "sampling/importance_sampling_ratio/max": 2.998013496398926, "sampling/importance_sampling_ratio/mean": 0.9541240930557251, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.514302253723145, "sampling/sampling_logp_difference/mean": 0.20346251130104065, "step": 107, "step_time": 105.58308988134377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3001.0, "completions/max_terminated_length": 3001.0, "completions/mean_length": 946.59375, "completions/mean_terminated_length": 946.59375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "degenerate_groups_filtered": 1.0, "entropy": 0.66323322057724, "epoch": 0.2660098522167488, "frac_reward_zero_std": 0.5, "grad_norm": 0.000932031147343889, "kl": 0.0015872836229391396, "learning_rate": 4.951191196746855e-05, "loss": -0.0009894074173644185, "num_tokens": 16113601.0, "reward": 1.11328125, "reward_std": 0.3419414758682251, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.04295487246579594, "sampling/importance_sampling_ratio/max": 2.993446111679077, "sampling/importance_sampling_ratio/mean": 0.9554958343505859, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.5941743850708, "sampling/sampling_logp_difference/mean": 0.1888352930545807, "step": 108, "step_time": 97.49898341693915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3759.0, "completions/mean_length": 842.390625, "completions/mean_terminated_length": 623.4745483398438, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.739134818315506, "epoch": 0.2684729064039409, "frac_reward_zero_std": 0.0, "grad_norm": 0.0018781326949399867, "kl": 0.0020018375653307885, "learning_rate": 4.950232673529922e-05, "loss": 0.0005048960447311401, "num_tokens": 16257498.0, "reward": 1.0234375, "reward_std": 0.29113471508026123, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.04292959802680545, "sampling/importance_sampling_ratio/max": 2.996737241744995, "sampling/importance_sampling_ratio/mean": 0.956794261932373, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.061861038208008, "sampling/sampling_logp_difference/mean": 0.19844233989715576, "step": 109, "step_time": 139.45302382390946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3361.0, "completions/mean_length": 930.125, "completions/mean_terminated_length": 894.04833984375, "completions/min_length": 1.0, "completions/min_terminated_length": 103.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7441273629665375, "epoch": 0.270935960591133, "frac_reward_zero_std": 0.5, "grad_norm": 0.0015235700232656716, "kl": 0.0016930506099015474, "learning_rate": 4.9492649243174894e-05, "loss": -0.03158733621239662, "num_tokens": 16415858.0, "reward": 1.0703125, "reward_std": 0.3631562292575836, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.05488494038581848, "sampling/importance_sampling_ratio/max": 2.996971607208252, "sampling/importance_sampling_ratio/mean": 0.9444047212600708, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.474295616149902, "sampling/sampling_logp_difference/mean": 0.2202835977077484, "step": 110, "step_time": 132.466704050079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2257.0, "completions/mean_length": 1196.625, "completions/mean_terminated_length": 1071.4833984375, "completions/min_length": 7.0, "completions/min_terminated_length": 291.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6919787973165512, "epoch": 0.2733990147783251, "frac_reward_zero_std": 0.75, "grad_norm": 0.001542486832013034, "kl": 0.0014645264309365302, "learning_rate": 4.948287952753475e-05, "loss": -0.008886125870049, "num_tokens": 16595290.0, "reward": 1.12890625, "reward_std": 0.4082292914390564, "rewards/reward_func/mean": 0.1254340277777778, "rewards/reward_func/std": 0.05789083242416382, "sampling/importance_sampling_ratio/max": 2.987086057662964, "sampling/importance_sampling_ratio/mean": 0.9435651302337646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.362783432006836, "sampling/sampling_logp_difference/mean": 0.21983519196510315, "step": 111, "step_time": 138.48055132851005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3345.0, "completions/mean_length": 994.453125, "completions/mean_terminated_length": 841.91796875, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7784600406885147, "epoch": 0.27586206896551724, "frac_reward_zero_std": 0.5, "grad_norm": 0.00044156116092231446, "kl": 0.002170788327930495, "learning_rate": 4.947301762516526e-05, "loss": -0.003606098936870694, "num_tokens": 16751047.0, "reward": 1.1484375, "reward_std": 0.37192854285240173, "rewards/reward_func/mean": 0.12760416666666666, "rewards/reward_func/std": 0.04553384002712038, "sampling/importance_sampling_ratio/max": 2.9941134452819824, "sampling/importance_sampling_ratio/mean": 0.9548653960227966, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.049118041992188, "sampling/sampling_logp_difference/mean": 0.20217272639274597, "step": 112, "step_time": 131.37046331982128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4091.0, "completions/mean_length": 1262.1875, "completions/mean_terminated_length": 1175.0509033203125, "completions/min_length": 10.0, "completions/min_terminated_length": 313.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7166846841573715, "epoch": 0.27832512315270935, "frac_reward_zero_std": 0.25, "grad_norm": 0.0022857984519741247, "kl": 0.0017328996036667377, "learning_rate": 4.946306357319997e-05, "loss": -0.003905626479536295, "num_tokens": 16924419.0, "reward": 1.15625, "reward_std": 0.47245559096336365, "rewards/reward_func/mean": 0.1284722222222222, "rewards/reward_func/std": 0.06712073087692261, "sampling/importance_sampling_ratio/max": 2.9988887310028076, "sampling/importance_sampling_ratio/mean": 0.9520525336265564, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.55725383758545, "sampling/sampling_logp_difference/mean": 0.1951729655265808, "step": 113, "step_time": 127.90999798686244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3223.0, "completions/mean_length": 950.78125, "completions/mean_terminated_length": 796.0983276367188, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7335464358329773, "epoch": 0.28078817733990147, "frac_reward_zero_std": 0.25, "grad_norm": 0.005655948261911067, "kl": 0.007333489367738366, "learning_rate": 4.9453017409119416e-05, "loss": -0.016323737800121307, "num_tokens": 17077733.0, "reward": 1.04296875, "reward_std": 0.40959399938583374, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.060785247219933405, "sampling/importance_sampling_ratio/max": 2.999847650527954, "sampling/importance_sampling_ratio/mean": 0.9522716999053955, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.624942779541016, "sampling/sampling_logp_difference/mean": 0.2057677060365677, "step": 114, "step_time": 179.57750754477456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2737.0, "completions/max_terminated_length": 2737.0, "completions/mean_length": 725.703125, "completions/mean_terminated_length": 725.703125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6960620880126953, "epoch": 0.2832512315270936, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011722383855332473, "kl": 0.0012825021694879979, "learning_rate": 4.9442879170750976e-05, "loss": 2.3127111489884555e-05, "num_tokens": 17196610.0, "reward": 1.0625, "reward_std": 0.24397501349449158, "rewards/reward_func/mean": 0.11805555555555555, "rewards/reward_func/std": 0.027108336488405865, "sampling/importance_sampling_ratio/max": 2.9984169006347656, "sampling/importance_sampling_ratio/mean": 0.9587997198104858, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.809239387512207, "sampling/sampling_logp_difference/mean": 0.17770594358444214, "step": 115, "step_time": 86.19620743719861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3126.0, "completions/mean_length": 824.265625, "completions/mean_terminated_length": 729.34423828125, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7041952311992645, "epoch": 0.2857142857142857, "frac_reward_zero_std": 0.75, "grad_norm": 0.0009556624442624107, "kl": 0.002167180966353044, "learning_rate": 4.943264889626871e-05, "loss": 0.009133076295256615, "num_tokens": 17336243.0, "reward": 1.02734375, "reward_std": 0.1788254678249359, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.02295756671163771, "sampling/importance_sampling_ratio/max": 2.990734577178955, "sampling/importance_sampling_ratio/mean": 0.958708643913269, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.196749687194824, "sampling/sampling_logp_difference/mean": 0.18489937484264374, "step": 116, "step_time": 175.12587342062034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3266.0, "completions/mean_length": 867.921875, "completions/mean_terminated_length": 763.790283203125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7263264954090118, "epoch": 0.2881773399014778, "frac_reward_zero_std": 0.0, "grad_norm": 0.016397730317110865, "kl": 0.002325467416085303, "learning_rate": 4.942232662419324e-05, "loss": -0.049474526196718216, "num_tokens": 17484766.0, "reward": 1.03515625, "reward_std": 0.5453031659126282, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.08766606450080872, "sampling/importance_sampling_ratio/max": 2.9991261959075928, "sampling/importance_sampling_ratio/mean": 0.951899528503418, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.624442100524902, "sampling/sampling_logp_difference/mean": 0.1993023008108139, "step": 117, "step_time": 189.81664845184423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3429.0, "completions/mean_length": 854.703125, "completions/mean_terminated_length": 803.2540283203125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6976221352815628, "epoch": 0.29064039408866993, "frac_reward_zero_std": 0.5, "grad_norm": 0.004327473408132298, "kl": 0.0014568110054824501, "learning_rate": 4.941191239339158e-05, "loss": 0.019404802471399307, "num_tokens": 17624443.0, "reward": 1.1171875, "reward_std": 0.4130047559738159, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.060594505733913846, "sampling/importance_sampling_ratio/max": 2.9980294704437256, "sampling/importance_sampling_ratio/mean": 0.9605697989463806, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.723148345947266, "sampling/sampling_logp_difference/mean": 0.18460223078727722, "step": 118, "step_time": 159.6808209202718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 1359.78125, "completions/mean_terminated_length": 1177.36669921875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "degenerate_groups_filtered": 0.0, "entropy": 0.62087182700634, "epoch": 0.29310344827586204, "frac_reward_zero_std": 0.25, "grad_norm": 0.00276605032209502, "kl": 0.0018352215993218124, "learning_rate": 4.9401406243077e-05, "loss": 0.0054818070493638515, "num_tokens": 17806925.0, "reward": 1.21484375, "reward_std": 0.5134446024894714, "rewards/reward_func/mean": 0.1349826388888889, "rewards/reward_func/std": 0.07329034474160936, "sampling/importance_sampling_ratio/max": 2.9989259243011475, "sampling/importance_sampling_ratio/mean": 0.9490303993225098, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.999980926513672, "sampling/sampling_logp_difference/mean": 0.19710107147693634, "step": 119, "step_time": 167.16362278792076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3621.0, "completions/mean_length": 1254.90625, "completions/mean_terminated_length": 1209.8095703125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7221937924623489, "epoch": 0.2955665024630542, "frac_reward_zero_std": 0.75, "grad_norm": 0.00043815358357884285, "kl": 0.0011200245935469866, "learning_rate": 4.939080821280889e-05, "loss": 0.0008629357325844467, "num_tokens": 17982919.0, "reward": 1.16796875, "reward_std": 0.38331958651542664, "rewards/reward_func/mean": 0.12977430555555555, "rewards/reward_func/std": 0.04572268989351061, "sampling/importance_sampling_ratio/max": 2.9984426498413086, "sampling/importance_sampling_ratio/mean": 0.9429291486740112, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.374656677246094, "sampling/sampling_logp_difference/mean": 0.21816638112068176, "step": 120, "step_time": 134.45339812664315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2754.0, "completions/max_terminated_length": 2754.0, "completions/mean_length": 913.25, "completions/mean_terminated_length": 913.25, "completions/min_length": 320.0, "completions/min_terminated_length": 320.0, "degenerate_groups_filtered": 0.0, "entropy": 0.709711492061615, "epoch": 0.29802955665024633, "frac_reward_zero_std": 0.25, "grad_norm": 0.0036506817753482997, "kl": 0.0010524207900743932, "learning_rate": 4.9380118342492596e-05, "loss": 0.029409902170300484, "num_tokens": 18122679.0, "reward": 1.00390625, "reward_std": 0.28692469000816345, "rewards/reward_func/mean": 0.1115451388888889, "rewards/reward_func/std": 0.04380870693259769, "sampling/importance_sampling_ratio/max": 2.9977598190307617, "sampling/importance_sampling_ratio/mean": 0.9520186185836792, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.124889373779297, "sampling/sampling_logp_difference/mean": 0.20045886933803558, "step": 121, "step_time": 87.02172928815708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3203.0, "completions/max_terminated_length": 3203.0, "completions/mean_length": 836.3125, "completions/mean_terminated_length": 838.857177734375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "degenerate_groups_filtered": 0.0, "entropy": 0.693224161863327, "epoch": 0.30049261083743845, "frac_reward_zero_std": 0.0, "grad_norm": 0.0030154542151378852, "kl": 0.001927026896737516, "learning_rate": 4.936933667237926e-05, "loss": 0.014132981188595295, "num_tokens": 18259051.0, "reward": 1.05078125, "reward_std": 0.34840819239616394, "rewards/reward_func/mean": 0.11675347222222222, "rewards/reward_func/std": 0.05086437861124674, "sampling/importance_sampling_ratio/max": 2.9976906776428223, "sampling/importance_sampling_ratio/mean": 0.9550529718399048, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.559123992919922, "sampling/sampling_logp_difference/mean": 0.18455424904823303, "step": 122, "step_time": 106.16042562597431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2646.0, "completions/mean_length": 979.375, "completions/mean_terminated_length": 826.0983276367188, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6227066367864609, "epoch": 0.30295566502463056, "frac_reward_zero_std": 0.5, "grad_norm": 0.003676565230234005, "kl": 0.00129530526464805, "learning_rate": 4.935846324306571e-05, "loss": 0.023714786395430565, "num_tokens": 18408915.0, "reward": 0.9921875, "reward_std": 0.2557619512081146, "rewards/reward_func/mean": 0.11024305555555555, "rewards/reward_func/std": 0.03941734631856283, "sampling/importance_sampling_ratio/max": 2.996018648147583, "sampling/importance_sampling_ratio/mean": 0.9630005359649658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 23.409278869628906, "sampling/sampling_logp_difference/mean": 0.1686955690383911, "step": 123, "step_time": 137.6973474638071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 865.578125, "completions/mean_terminated_length": 814.3016357421875, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7077045142650604, "epoch": 0.3054187192118227, "frac_reward_zero_std": 0.75, "grad_norm": 0.0019030142687087924, "kl": 0.0012345675058895722, "learning_rate": 4.934749809549427e-05, "loss": -8.280670590465888e-05, "num_tokens": 18543704.0, "reward": 1.015625, "reward_std": 0.21764887869358063, "rewards/reward_func/mean": 0.11284722222222222, "rewards/reward_func/std": 0.03337423337830438, "sampling/importance_sampling_ratio/max": 2.9979424476623535, "sampling/importance_sampling_ratio/mean": 0.9609706997871399, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.68747329711914, "sampling/sampling_logp_difference/mean": 0.1729896515607834, "step": 124, "step_time": 119.92689338512719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3934.0, "completions/mean_length": 1081.171875, "completions/mean_terminated_length": 989.0491333007812, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8372175991535187, "epoch": 0.3078817733990148, "frac_reward_zero_std": 0.0, "grad_norm": 0.0045321344703967556, "kl": 0.0017608296184334904, "learning_rate": 4.9336441270952595e-05, "loss": -0.025809479877352715, "num_tokens": 18719891.0, "reward": 1.05078125, "reward_std": 0.4821818768978119, "rewards/reward_func/mean": 0.11675347222222222, "rewards/reward_func/std": 0.07227780090437995, "sampling/importance_sampling_ratio/max": 2.9974734783172607, "sampling/importance_sampling_ratio/mean": 0.9361357688903809, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.645075798034668, "sampling/sampling_logp_difference/mean": 0.24367718398571014, "step": 125, "step_time": 135.73762777121738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1891.0, "completions/mean_length": 677.125, "completions/mean_terminated_length": 622.857177734375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7165095210075378, "epoch": 0.3103448275862069, "frac_reward_zero_std": 0.25, "grad_norm": 0.004739101792306716, "kl": 0.001693987287580967, "learning_rate": 4.932529281107355e-05, "loss": 0.0007048757979646325, "num_tokens": 18847387.0, "reward": 1.1171875, "reward_std": 0.3830971121788025, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.053545390566190086, "sampling/importance_sampling_ratio/max": 2.9993577003479004, "sampling/importance_sampling_ratio/mean": 0.9612326622009277, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.749794006347656, "sampling/sampling_logp_difference/mean": 0.18317507207393646, "step": 126, "step_time": 112.82818150427192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3864.0, "completions/mean_length": 1438.4375, "completions/mean_terminated_length": 1213.2203369140625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6391656994819641, "epoch": 0.312807881773399, "frac_reward_zero_std": 0.5, "grad_norm": 0.007431714168948687, "kl": 0.0014243643818190321, "learning_rate": 4.931405275783507e-05, "loss": 0.019251395016908646, "num_tokens": 19034471.0, "reward": 1.02734375, "reward_std": 0.3566744029521942, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.053882877031962075, "sampling/importance_sampling_ratio/max": 2.9972875118255615, "sampling/importance_sampling_ratio/mean": 0.9524862766265869, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.582551002502441, "sampling/sampling_logp_difference/mean": 0.18535523116588593, "step": 127, "step_time": 152.86888752900995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3641.0, "completions/mean_length": 992.078125, "completions/mean_terminated_length": 942.8095703125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7497861087322235, "epoch": 0.31527093596059114, "frac_reward_zero_std": 0.25, "grad_norm": 0.0030525830028834324, "kl": 0.0023082339903339744, "learning_rate": 4.930272115355992e-05, "loss": -0.015048885717988014, "num_tokens": 19181260.0, "reward": 1.28515625, "reward_std": 0.556111752986908, "rewards/reward_func/mean": 0.1427951388888889, "rewards/reward_func/std": 0.07730624907546574, "sampling/importance_sampling_ratio/max": 2.9983325004577637, "sampling/importance_sampling_ratio/mean": 0.9502414464950562, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.081714630126953, "sampling/sampling_logp_difference/mean": 0.21059447526931763, "step": 128, "step_time": 121.74499881407246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3749.0, "completions/max_terminated_length": 3749.0, "completions/mean_length": 1172.21875, "completions/mean_terminated_length": 1165.761962890625, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7381231337785721, "epoch": 0.31773399014778325, "frac_reward_zero_std": 0.25, "grad_norm": 0.007650712047425518, "kl": 0.001185288158012554, "learning_rate": 4.929129804091562e-05, "loss": 0.06440460681915283, "num_tokens": 19359098.0, "reward": 1.2890625, "reward_std": 0.7046032547950745, "rewards/reward_func/mean": 0.14322916666666666, "rewards/reward_func/std": 0.12921956926584244, "sampling/importance_sampling_ratio/max": 2.998009204864502, "sampling/importance_sampling_ratio/mean": 0.944693386554718, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.996122360229492, "sampling/sampling_logp_difference/mean": 0.2195536196231842, "step": 129, "step_time": 115.53860899922438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3714.0, "completions/max_terminated_length": 3714.0, "completions/mean_length": 464.375, "completions/mean_terminated_length": 471.7301940917969, "completions/min_length": 1.0, "completions/min_terminated_length": 62.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6270382106304169, "epoch": 0.32019704433497537, "frac_reward_zero_std": 0.25, "grad_norm": 0.004284817811709977, "kl": 0.00273809939972125, "learning_rate": 4.927978346291424e-05, "loss": -0.030350536108016968, "num_tokens": 19451570.0, "reward": 1.04296875, "reward_std": 0.37679383158683777, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.05647122197681003, "sampling/importance_sampling_ratio/max": 2.9892261028289795, "sampling/importance_sampling_ratio/mean": 0.9704676270484924, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.899042129516602, "sampling/sampling_logp_difference/mean": 0.15999342501163483, "step": 130, "step_time": 99.42007652996108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2876.0, "completions/mean_length": 1104.359375, "completions/mean_terminated_length": 957.2294311523438, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6907539814710617, "epoch": 0.3226600985221675, "frac_reward_zero_std": 0.5, "grad_norm": 0.0029258177695383855, "kl": 0.001306317833950743, "learning_rate": 4.9268177462912255e-05, "loss": -0.0022134785540401936, "num_tokens": 19621513.0, "reward": 0.984375, "reward_std": 0.3238992393016815, "rewards/reward_func/mean": 0.109375, "rewards/reward_func/std": 0.049388562639554344, "sampling/importance_sampling_ratio/max": 2.9986183643341064, "sampling/importance_sampling_ratio/mean": 0.9501324892044067, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.441189765930176, "sampling/sampling_logp_difference/mean": 0.20469427108764648, "step": 131, "step_time": 151.08437192393467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3990.0, "completions/mean_length": 1114.28125, "completions/mean_terminated_length": 970.050048828125, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7603419423103333, "epoch": 0.3251231527093596, "frac_reward_zero_std": 0.0, "grad_norm": 0.002124359777457952, "kl": 0.0012819624680560082, "learning_rate": 4.9256480084610376e-05, "loss": -0.002804091200232506, "num_tokens": 19780075.0, "reward": 1.04296875, "reward_std": 0.34947434067726135, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.053156735168562994, "sampling/importance_sampling_ratio/max": 2.9991204738616943, "sampling/importance_sampling_ratio/mean": 0.9497889876365662, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.665460586547852, "sampling/sampling_logp_difference/mean": 0.20453506708145142, "step": 132, "step_time": 129.8263506561052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3803.0, "completions/mean_length": 986.703125, "completions/mean_terminated_length": 914.131103515625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6631338149309158, "epoch": 0.3275862068965517, "frac_reward_zero_std": 0.25, "grad_norm": 0.001555458462487788, "kl": 0.0020409352728165686, "learning_rate": 4.9244691372053376e-05, "loss": -0.024803729727864265, "num_tokens": 19921752.0, "reward": 1.03515625, "reward_std": 0.3359043300151825, "rewards/reward_func/mean": 0.1150173611111111, "rewards/reward_func/std": 0.050191783242755465, "sampling/importance_sampling_ratio/max": 2.9955570697784424, "sampling/importance_sampling_ratio/mean": 0.9567161202430725, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.687455177307129, "sampling/sampling_logp_difference/mean": 0.18763144314289093, "step": 133, "step_time": 125.27077388390899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3702.0, "completions/mean_length": 1057.578125, "completions/mean_terminated_length": 1009.3492431640625, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7682660669088364, "epoch": 0.33004926108374383, "frac_reward_zero_std": 0.5, "grad_norm": 0.009635775481836006, "kl": 0.0018410422198940068, "learning_rate": 4.9232811369629936e-05, "loss": 0.06454427540302277, "num_tokens": 20080669.0, "reward": 1.328125, "reward_std": 0.8139105439186096, "rewards/reward_func/mean": 0.14756944444444445, "rewards/reward_func/std": 0.12541627056068844, "sampling/importance_sampling_ratio/max": 2.997227430343628, "sampling/importance_sampling_ratio/mean": 0.9470077157020569, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.934889793395996, "sampling/sampling_logp_difference/mean": 0.21396198868751526, "step": 134, "step_time": 129.73883415712044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3652.0, "completions/mean_length": 1508.015625, "completions/mean_terminated_length": 1259.2982177734375, "completions/min_length": 74.0, "completions/min_terminated_length": 74.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6884257197380066, "epoch": 0.33251231527093594, "frac_reward_zero_std": 0.5, "grad_norm": 0.0030280563055380292, "kl": 0.002304157940670848, "learning_rate": 4.9220840122072495e-05, "loss": 0.007324616890400648, "num_tokens": 20260622.0, "reward": 1.0546875, "reward_std": 0.3068941533565521, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.044668421149253845, "sampling/importance_sampling_ratio/max": 2.999495029449463, "sampling/importance_sampling_ratio/mean": 0.9472457766532898, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.176748275756836, "sampling/sampling_logp_difference/mean": 0.20310401916503906, "step": 135, "step_time": 131.45040617790073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3621.0, "completions/mean_length": 1437.515625, "completions/mean_terminated_length": 1059.5, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7024633884429932, "epoch": 0.33497536945812806, "frac_reward_zero_std": 0.0, "grad_norm": 0.0020426303305261725, "kl": 0.0019914144941139966, "learning_rate": 4.920877767445705e-05, "loss": -0.009508827701210976, "num_tokens": 20449311.0, "reward": 1.1171875, "reward_std": 0.4858488440513611, "rewards/reward_func/mean": 0.12413194444444445, "rewards/reward_func/std": 0.06965038345919715, "sampling/importance_sampling_ratio/max": 2.997471809387207, "sampling/importance_sampling_ratio/mean": 0.9443151950836182, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.433490753173828, "sampling/sampling_logp_difference/mean": 0.21754810214042664, "step": 136, "step_time": 137.67270130012184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3679.0, "completions/mean_length": 1181.625, "completions/mean_terminated_length": 987.3333740234375, "completions/min_length": 339.0, "completions/min_terminated_length": 339.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7879326194524765, "epoch": 0.3374384236453202, "frac_reward_zero_std": 0.75, "grad_norm": 0.0020936069842209834, "kl": 0.001667340548010543, "learning_rate": 4.919662407220299e-05, "loss": 0.006686965469270945, "num_tokens": 20614039.0, "reward": 1.09375, "reward_std": 0.3435921370983124, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.04884182744556003, "sampling/importance_sampling_ratio/max": 2.9936232566833496, "sampling/importance_sampling_ratio/mean": 0.9455877542495728, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.05931282043457, "sampling/sampling_logp_difference/mean": 0.21757498383522034, "step": 137, "step_time": 129.50531403324567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2448.0, "completions/mean_length": 1257.078125, "completions/mean_terminated_length": 1152.0655517578125, "completions/min_length": 300.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.666120707988739, "epoch": 0.3399014778325123, "frac_reward_zero_std": 0.25, "grad_norm": 0.0015588500520508758, "kl": 0.001390039746183902, "learning_rate": 4.918437936107293e-05, "loss": -0.0016076350584626198, "num_tokens": 20793404.0, "reward": 1.140625, "reward_std": 0.41755858063697815, "rewards/reward_func/mean": 0.1267361111111111, "rewards/reward_func/std": 0.05750518043835958, "sampling/importance_sampling_ratio/max": 2.994096517562866, "sampling/importance_sampling_ratio/mean": 0.9516454339027405, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.36985206604004, "sampling/sampling_logp_difference/mean": 0.19270411133766174, "step": 138, "step_time": 128.2147407066077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2123.0, "completions/mean_length": 1000.9375, "completions/mean_terminated_length": 810.559326171875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7744076550006866, "epoch": 0.34236453201970446, "frac_reward_zero_std": 0.25, "grad_norm": 0.001767253547338134, "kl": 0.0015895857068244368, "learning_rate": 4.9172043587172564e-05, "loss": -0.018749739974737167, "num_tokens": 20938440.0, "reward": 1.02734375, "reward_std": 0.2991959750652313, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.04486183987723456, "sampling/importance_sampling_ratio/max": 2.9982097148895264, "sampling/importance_sampling_ratio/mean": 0.9536800384521484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.281064987182617, "sampling/sampling_logp_difference/mean": 0.20259705185890198, "step": 139, "step_time": 182.23772311606444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3620.0, "completions/mean_length": 1145.375, "completions/mean_terminated_length": 1098.539794921875, "completions/min_length": 287.0, "completions/min_terminated_length": 287.0, "degenerate_groups_filtered": 1.0, "entropy": 0.670438826084137, "epoch": 0.3448275862068966, "frac_reward_zero_std": 0.75, "grad_norm": 0.0004879941470283451, "kl": 0.0015858457772992551, "learning_rate": 4.915961679695046e-05, "loss": -0.0004922771477140486, "num_tokens": 21103968.0, "reward": 1.12109375, "reward_std": 0.33627331256866455, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.04050926036304898, "sampling/importance_sampling_ratio/max": 2.9990146160125732, "sampling/importance_sampling_ratio/mean": 0.9465623497962952, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.31145477294922, "sampling/sampling_logp_difference/mean": 0.20568417012691498, "step": 140, "step_time": 119.32564870314673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3744.0, "completions/max_terminated_length": 3744.0, "completions/mean_length": 932.421875, "completions/mean_terminated_length": 932.825439453125, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6941824108362198, "epoch": 0.3472906403940887, "frac_reward_zero_std": 0.25, "grad_norm": 0.0012234709661190007, "kl": 0.0026189969503320754, "learning_rate": 4.914709903719788e-05, "loss": 0.015446648001670837, "num_tokens": 21242299.0, "reward": 1.12109375, "reward_std": 0.3645833432674408, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.04644498642947939, "sampling/importance_sampling_ratio/max": 2.997051954269409, "sampling/importance_sampling_ratio/mean": 0.9586159586906433, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.005863189697266, "sampling/sampling_logp_difference/mean": 0.18264621496200562, "step": 141, "step_time": 141.0135326378513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3652.0, "completions/max_terminated_length": 3652.0, "completions/mean_length": 636.125, "completions/mean_terminated_length": 636.125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7068347334861755, "epoch": 0.3497536945812808, "frac_reward_zero_std": 0.5, "grad_norm": 0.0007979245291869645, "kl": 0.0020473987387958914, "learning_rate": 4.913449035504865e-05, "loss": -0.0025128277484327555, "num_tokens": 21377475.0, "reward": 1.05078125, "reward_std": 0.25268277525901794, "rewards/reward_func/mean": 0.11675347222222222, "rewards/reward_func/std": 0.03302617081337505, "sampling/importance_sampling_ratio/max": 2.999530553817749, "sampling/importance_sampling_ratio/mean": 0.9572858214378357, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.087381362915039, "sampling/sampling_logp_difference/mean": 0.18261964619159698, "step": 142, "step_time": 107.95403501321562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3583.0, "completions/mean_length": 839.75, "completions/mean_terminated_length": 788.758056640625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7018862813711166, "epoch": 0.3522167487684729, "frac_reward_zero_std": 0.5, "grad_norm": 0.002889781139543682, "kl": 0.0021695691102650017, "learning_rate": 4.912179079797892e-05, "loss": 0.0009456706466153264, "num_tokens": 21512947.0, "reward": 1.01953125, "reward_std": 0.2998170256614685, "rewards/reward_func/mean": 0.11328125, "rewards/reward_func/std": 0.04724570612112681, "sampling/importance_sampling_ratio/max": 2.992983102798462, "sampling/importance_sampling_ratio/mean": 0.9585220217704773, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.02216911315918, "sampling/sampling_logp_difference/mean": 0.18149858713150024, "step": 143, "step_time": 129.82384162512608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2424.0, "completions/mean_length": 869.0625, "completions/mean_terminated_length": 782.1451416015625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6957240998744965, "epoch": 0.35467980295566504, "frac_reward_zero_std": 0.0, "grad_norm": 0.001834099632040615, "kl": 0.0014171720249578357, "learning_rate": 4.910900041380703e-05, "loss": -0.017629800364375114, "num_tokens": 21655623.0, "reward": 1.0390625, "reward_std": 0.3249503970146179, "rewards/reward_func/mean": 0.1154513888888889, "rewards/reward_func/std": 0.04804881579346127, "sampling/importance_sampling_ratio/max": 2.988936185836792, "sampling/importance_sampling_ratio/mean": 0.9553591012954712, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.53939437866211, "sampling/sampling_logp_difference/mean": 0.18938414752483368, "step": 144, "step_time": 164.86258218903095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3388.0, "completions/mean_length": 1086.5625, "completions/mean_terminated_length": 975.6229248046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7083845734596252, "epoch": 0.35714285714285715, "frac_reward_zero_std": 0.25, "grad_norm": 0.007941490291629926, "kl": 0.0013565148983616382, "learning_rate": 4.909611925069332e-05, "loss": 0.014829241670668125, "num_tokens": 21809019.0, "reward": 1.2734375, "reward_std": 0.8246195912361145, "rewards/reward_func/mean": 0.14149305555555555, "rewards/reward_func/std": 0.11937192496326235, "sampling/importance_sampling_ratio/max": 2.999521017074585, "sampling/importance_sampling_ratio/mean": 0.9506230354309082, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.645586013793945, "sampling/sampling_logp_difference/mean": 0.20339885354042053, "step": 145, "step_time": 168.08624598686583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3746.0, "completions/mean_length": 1037.734375, "completions/mean_terminated_length": 891.016357421875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7873726338148117, "epoch": 0.35960591133004927, "frac_reward_zero_std": 0.0, "grad_norm": 0.0023097908581655286, "kl": 0.003050926752621308, "learning_rate": 4.9083147357139936e-05, "loss": 0.007973194122314453, "num_tokens": 21970106.0, "reward": 1.08984375, "reward_std": 0.3709896206855774, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.05479054152965546, "sampling/importance_sampling_ratio/max": 2.9961843490600586, "sampling/importance_sampling_ratio/mean": 0.9499616622924805, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.426872253417969, "sampling/sampling_logp_difference/mean": 0.20822127163410187, "step": 146, "step_time": 171.15286494023167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3201.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 1070.609375, "completions/mean_terminated_length": 1070.609375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.639614000916481, "epoch": 0.3620689655172414, "frac_reward_zero_std": 0.75, "grad_norm": 0.0035868637404805775, "kl": 0.0013124027755111456, "learning_rate": 4.9070084781990655e-05, "loss": 0.0779402107000351, "num_tokens": 22122833.0, "reward": 1.27734375, "reward_std": 0.7252766489982605, "rewards/reward_func/mean": 0.14192708333333334, "rewards/reward_func/std": 0.10532407628165351, "sampling/importance_sampling_ratio/max": 2.9976370334625244, "sampling/importance_sampling_ratio/mean": 0.9555914402008057, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.999216079711914, "sampling/sampling_logp_difference/mean": 0.18233312666416168, "step": 147, "step_time": 92.5367358867079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3682.0, "completions/mean_length": 1106.71875, "completions/mean_terminated_length": 1010.290283203125, "completions/min_length": 309.0, "completions/min_terminated_length": 309.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7598123103380203, "epoch": 0.3645320197044335, "frac_reward_zero_std": 0.5, "grad_norm": 0.0006033462364919338, "kl": 0.0015440615534316748, "learning_rate": 4.905693157443072e-05, "loss": -6.69892760924995e-05, "num_tokens": 22281695.0, "reward": 1.1015625, "reward_std": 0.3203382194042206, "rewards/reward_func/mean": 0.12239583333333333, "rewards/reward_func/std": 0.03982427467902502, "sampling/importance_sampling_ratio/max": 2.9990968704223633, "sampling/importance_sampling_ratio/mean": 0.9494574666023254, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.545462608337402, "sampling/sampling_logp_difference/mean": 0.21209058165550232, "step": 148, "step_time": 171.81504101189785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3808.0, "completions/mean_length": 1134.984375, "completions/mean_terminated_length": 1027.7540283203125, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6977289170026779, "epoch": 0.3669950738916256, "frac_reward_zero_std": 0.25, "grad_norm": 0.001889963426633302, "kl": 0.0013170290039852262, "learning_rate": 4.904368778398662e-05, "loss": 0.007757263723760843, "num_tokens": 22433710.0, "reward": 1.0546875, "reward_std": 0.3463779389858246, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.05063716073830923, "sampling/importance_sampling_ratio/max": 2.9935598373413086, "sampling/importance_sampling_ratio/mean": 0.9530324935913086, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.762185096740723, "sampling/sampling_logp_difference/mean": 0.18600577116012573, "step": 149, "step_time": 124.45693185203709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 3848.0, "completions/max_terminated_length": 3848.0, "completions/mean_length": 987.8125, "completions/mean_terminated_length": 977.4917602539062, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7574534267187119, "epoch": 0.3694581280788177, "frac_reward_zero_std": 0.25, "grad_norm": 0.002926058083550387, "kl": 0.001403219037456438, "learning_rate": 4.903035346052593e-05, "loss": 0.0012572875712066889, "num_tokens": 22592034.0, "reward": 1.01171875, "reward_std": 0.27251651883125305, "rewards/reward_func/mean": 0.11241319444444445, "rewards/reward_func/std": 0.041424840688705444, "sampling/importance_sampling_ratio/max": 2.9942948818206787, "sampling/importance_sampling_ratio/mean": 0.9399828910827637, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.540233612060547, "sampling/sampling_logp_difference/mean": 0.23285743594169617, "step": 150, "step_time": 125.10149595234543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2618.0, "completions/mean_length": 1055.484375, "completions/mean_terminated_length": 965.016357421875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6921143233776093, "epoch": 0.37192118226600984, "frac_reward_zero_std": 0.75, "grad_norm": 0.0003847117777456799, "kl": 0.001394979510223493, "learning_rate": 4.9016928654257096e-05, "loss": -0.0016071898862719536, "num_tokens": 22765713.0, "reward": 1.07421875, "reward_std": 0.2734251022338867, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.03352663583225674, "sampling/importance_sampling_ratio/max": 2.999713182449341, "sampling/importance_sampling_ratio/mean": 0.9436339139938354, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.68712329864502, "sampling/sampling_logp_difference/mean": 0.21123811602592468, "step": 151, "step_time": 180.94982343283482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2617.0, "completions/mean_length": 864.890625, "completions/mean_terminated_length": 813.6032104492188, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7735969722270966, "epoch": 0.37438423645320196, "frac_reward_zero_std": 0.5, "grad_norm": 0.0005504224354699446, "kl": 0.001623089803615585, "learning_rate": 4.9003413415729295e-05, "loss": -0.0025445527862757444, "num_tokens": 22903450.0, "reward": 1.1328125, "reward_std": 0.35626131296157837, "rewards/reward_func/mean": 0.12586805555555555, "rewards/reward_func/std": 0.043802719149324626, "sampling/importance_sampling_ratio/max": 2.998650312423706, "sampling/importance_sampling_ratio/mean": 0.9493921995162964, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.77414608001709, "sampling/sampling_logp_difference/mean": 0.21335291862487793, "step": 152, "step_time": 125.38044445589185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2847.0, "completions/mean_length": 1196.296875, "completions/mean_terminated_length": 1143.4031982421875, "completions/min_length": 56.0, "completions/min_terminated_length": 56.0, "degenerate_groups_filtered": 1.0, "entropy": 0.741541400551796, "epoch": 0.3768472906403941, "frac_reward_zero_std": 0.5, "grad_norm": 0.0024813468622562054, "kl": 0.0012266744452062994, "learning_rate": 4.898980779583218e-05, "loss": -0.013216846622526646, "num_tokens": 23078717.0, "reward": 1.140625, "reward_std": 0.4151759147644043, "rewards/reward_func/mean": 0.1267361111111111, "rewards/reward_func/std": 0.05974882344404856, "sampling/importance_sampling_ratio/max": 2.998521327972412, "sampling/importance_sampling_ratio/mean": 0.9421178102493286, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.311586380004883, "sampling/sampling_logp_difference/mean": 0.2195492386817932, "step": 153, "step_time": 141.04792174184695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3065.0, "completions/max_terminated_length": 3065.0, "completions/mean_length": 1088.75, "completions/mean_terminated_length": 1098.508056640625, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7633115500211716, "epoch": 0.3793103448275862, "frac_reward_zero_std": 0.5, "grad_norm": 0.004701491570482404, "kl": 0.0012460256984923035, "learning_rate": 4.897611184579575e-05, "loss": 0.05346252769231796, "num_tokens": 23238861.0, "reward": 1.29296875, "reward_std": 0.7299634218215942, "rewards/reward_func/mean": 0.14366319444444445, "rewards/reward_func/std": 0.1064673662185669, "sampling/importance_sampling_ratio/max": 2.9990603923797607, "sampling/importance_sampling_ratio/mean": 0.9426702857017517, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.55482292175293, "sampling/sampling_logp_difference/mean": 0.22415803372859955, "step": 154, "step_time": 100.15680134599097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2150.0, "completions/max_terminated_length": 2150.0, "completions/mean_length": 798.84375, "completions/mean_terminated_length": 798.84375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7754421979188919, "epoch": 0.3817733990147783, "frac_reward_zero_std": 0.75, "grad_norm": 0.0007325242776395226, "kl": 0.00122224964434281, "learning_rate": 4.896232561719011e-05, "loss": 0.0007550335139967501, "num_tokens": 23365331.0, "reward": 1.12109375, "reward_std": 0.33627331256866455, "rewards/reward_func/mean": 0.12456597222222222, "rewards/reward_func/std": 0.04050926036304898, "sampling/importance_sampling_ratio/max": 2.996500015258789, "sampling/importance_sampling_ratio/mean": 0.9564509391784668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.966944694519043, "sampling/sampling_logp_difference/mean": 0.1989564299583435, "step": 155, "step_time": 84.85235846112482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3084.0, "completions/mean_length": 1392.203125, "completions/mean_terminated_length": 1212.5762939453125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6894625276327133, "epoch": 0.3842364532019704, "frac_reward_zero_std": 0.25, "grad_norm": 0.0007756937710466033, "kl": 0.0011820744257420301, "learning_rate": 4.8948449161925304e-05, "loss": -0.0026572286151349545, "num_tokens": 23533808.0, "reward": 1.16015625, "reward_std": 0.3892585337162018, "rewards/reward_func/mean": 0.12890625, "rewards/reward_func/std": 0.04816830199625757, "sampling/importance_sampling_ratio/max": 2.997208833694458, "sampling/importance_sampling_ratio/mean": 0.9468032121658325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.121822357177734, "sampling/sampling_logp_difference/mean": 0.20983976125717163, "step": 156, "step_time": 161.84404824208468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3994.0, "completions/mean_length": 1185.328125, "completions/mean_terminated_length": 1042.1802978515625, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6832298785448074, "epoch": 0.3866995073891626, "frac_reward_zero_std": 0.0, "grad_norm": 0.0017762955728373934, "kl": 0.0010904025693889707, "learning_rate": 4.893448253225111e-05, "loss": 0.003249376080930233, "num_tokens": 23689877.0, "reward": 1.046875, "reward_std": 0.31140682101249695, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.04530912637710571, "sampling/importance_sampling_ratio/max": 2.9998233318328857, "sampling/importance_sampling_ratio/mean": 0.95380699634552, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.548775672912598, "sampling/sampling_logp_difference/mean": 0.18981406092643738, "step": 157, "step_time": 126.51651544007473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3857.0, "completions/mean_length": 1334.03125, "completions/mean_terminated_length": 1290.1905517578125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "degenerate_groups_filtered": 1.0, "entropy": 0.8318304419517517, "epoch": 0.3891625615763547, "frac_reward_zero_std": 0.75, "grad_norm": 0.0006476170158019754, "kl": 0.0011377888440620154, "learning_rate": 4.892042578075685e-05, "loss": 0.002958628349006176, "num_tokens": 23868615.0, "reward": 1.1796875, "reward_std": 0.38951727747917175, "rewards/reward_func/mean": 0.1310763888888889, "rewards/reward_func/std": 0.048582213620344795, "sampling/importance_sampling_ratio/max": 2.9970788955688477, "sampling/importance_sampling_ratio/mean": 0.9396538734436035, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.113051414489746, "sampling/sampling_logp_difference/mean": 0.2308649867773056, "step": 158, "step_time": 133.94302169582807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3591.0, "completions/mean_length": 1060.359375, "completions/mean_terminated_length": 941.6834106445312, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6752626597881317, "epoch": 0.3916256157635468, "frac_reward_zero_std": 0.5, "grad_norm": 0.001742199637816057, "kl": 0.0013848040252923965, "learning_rate": 4.8906278960371176e-05, "loss": 0.004247048869729042, "num_tokens": 24020910.0, "reward": 1.109375, "reward_std": 0.3417827785015106, "rewards/reward_func/mean": 0.1232638888888889, "rewards/reward_func/std": 0.04796475751532449, "sampling/importance_sampling_ratio/max": 2.994645357131958, "sampling/importance_sampling_ratio/mean": 0.9534465074539185, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.727479934692383, "sampling/sampling_logp_difference/mean": 0.196367084980011, "step": 159, "step_time": 114.65341229783371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3619.0, "completions/mean_length": 1137.125, "completions/mean_terminated_length": 1041.6773681640625, "completions/min_length": 235.0, "completions/min_terminated_length": 235.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7267605811357498, "epoch": 0.39408866995073893, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016509462080566294, "kl": 0.0013555605837609619, "learning_rate": 4.889204212436189e-05, "loss": 0.003579859621822834, "num_tokens": 24188486.0, "reward": 1.02734375, "reward_std": 0.24035847187042236, "rewards/reward_func/mean": 0.11414930555555555, "rewards/reward_func/std": 0.03507047891616821, "sampling/importance_sampling_ratio/max": 2.99871563911438, "sampling/importance_sampling_ratio/mean": 0.9451822638511658, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.253337860107422, "sampling/sampling_logp_difference/mean": 0.21290577948093414, "step": 160, "step_time": 127.22938701603562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3007.0, "completions/mean_length": 1078.546875, "completions/mean_terminated_length": 989.4425659179688, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7306022793054581, "epoch": 0.39655172413793105, "frac_reward_zero_std": 0.75, "grad_norm": 0.004243209978439369, "kl": 0.001163211651146412, "learning_rate": 4.8877715326335735e-05, "loss": 0.04088365659117699, "num_tokens": 24345497.0, "reward": 1.1875, "reward_std": 0.6871842741966248, "rewards/reward_func/mean": 0.13194444444444445, "rewards/reward_func/std": 0.10439738300111559, "sampling/importance_sampling_ratio/max": 2.997098922729492, "sampling/importance_sampling_ratio/mean": 0.9458619356155396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.110089302062988, "sampling/sampling_logp_difference/mean": 0.20933616161346436, "step": 161, "step_time": 176.95277623389848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2682.0, "completions/mean_length": 613.265625, "completions/mean_terminated_length": 557.984130859375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6799998581409454, "epoch": 0.39901477832512317, "frac_reward_zero_std": 0.0, "grad_norm": 0.0033757850857375773, "kl": 0.002622713363962248, "learning_rate": 4.886329862023818e-05, "loss": -0.01957223378121853, "num_tokens": 24457466.0, "reward": 1.015625, "reward_std": 0.3532024621963501, "rewards/reward_func/mean": 0.11284722222222222, "rewards/reward_func/std": 0.05273487501674228, "sampling/importance_sampling_ratio/max": 2.9935693740844727, "sampling/importance_sampling_ratio/mean": 0.9666866064071655, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.023531913757324, "sampling/sampling_logp_difference/mean": 0.16631880402565002, "step": 162, "step_time": 128.8812639042735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3618.0, "completions/mean_length": 785.078125, "completions/mean_terminated_length": 678.274169921875, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6376421004533768, "epoch": 0.4014778325123153, "frac_reward_zero_std": 0.5, "grad_norm": 0.0005021024269253694, "kl": 0.001497179502621293, "learning_rate": 4.884879206035324e-05, "loss": -0.000561786990147084, "num_tokens": 24586111.0, "reward": 1.0234375, "reward_std": 0.18213215470314026, "rewards/reward_func/mean": 0.11371527777777778, "rewards/reward_func/std": 0.02435668061176936, "sampling/importance_sampling_ratio/max": 2.993292808532715, "sampling/importance_sampling_ratio/mean": 0.9647999405860901, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.675369262695312, "sampling/sampling_logp_difference/mean": 0.15786173939704895, "step": 163, "step_time": 180.46688992506824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3760.0, "completions/max_terminated_length": 3760.0, "completions/mean_length": 726.25, "completions/mean_terminated_length": 731.6032104492188, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6761015057563782, "epoch": 0.4039408866995074, "frac_reward_zero_std": 0.25, "grad_norm": 0.0021193524478002454, "kl": 0.0015740302333142608, "learning_rate": 4.883419570130327e-05, "loss": -0.017717270180583, "num_tokens": 24721503.0, "reward": 1.11328125, "reward_std": 0.4247541129589081, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.060799873537487455, "sampling/importance_sampling_ratio/max": 2.9982118606567383, "sampling/importance_sampling_ratio/mean": 0.9577165842056274, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.686821937561035, "sampling/sampling_logp_difference/mean": 0.17808575928211212, "step": 164, "step_time": 120.17617022292688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3677.0, "completions/mean_length": 1376.09375, "completions/mean_terminated_length": 1288.3548583984375, "completions/min_length": 313.0, "completions/min_terminated_length": 313.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7354206442832947, "epoch": 0.4064039408866995, "frac_reward_zero_std": 0.5, "grad_norm": 0.0008399744445777652, "kl": 0.0009593560826033354, "learning_rate": 4.881950959804874e-05, "loss": -0.0010941341752186418, "num_tokens": 24899525.0, "reward": 1.0546875, "reward_std": 0.29703810811042786, "rewards/reward_func/mean": 0.1171875, "rewards/reward_func/std": 0.04189008225997289, "sampling/importance_sampling_ratio/max": 2.9984233379364014, "sampling/importance_sampling_ratio/mean": 0.9466732740402222, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.314602851867676, "sampling/sampling_logp_difference/mean": 0.2059394121170044, "step": 165, "step_time": 174.922209485434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 842.625, "completions/mean_terminated_length": 737.6773681640625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7169460654258728, "epoch": 0.4088669950738916, "frac_reward_zero_std": 0.25, "grad_norm": 0.0066154200174927385, "kl": 0.0014329378900583833, "learning_rate": 4.8804733805888024e-05, "loss": 0.002477045636624098, "num_tokens": 25029053.0, "reward": 1.11328125, "reward_std": 0.4606105387210846, "rewards/reward_func/mean": 0.12369791666666667, "rewards/reward_func/std": 0.07763891460167037, "sampling/importance_sampling_ratio/max": 2.997968912124634, "sampling/importance_sampling_ratio/mean": 0.9603346586227417, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.809807777404785, "sampling/sampling_logp_difference/mean": 0.18966203927993774, "step": 166, "step_time": 116.80163077195175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3102.0, "completions/mean_length": 1133.796875, "completions/mean_terminated_length": 988.1146850585938, "completions/min_length": 241.0, "completions/min_terminated_length": 241.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7492803931236267, "epoch": 0.41133004926108374, "frac_reward_zero_std": 0.5, "grad_norm": 0.0032851832185571083, "kl": 0.001638269837712869, "learning_rate": 4.8789868380457246e-05, "loss": 0.040263283997774124, "num_tokens": 25182896.0, "reward": 1.18359375, "reward_std": 0.6889752745628357, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.09606481591860454, "sampling/importance_sampling_ratio/max": 2.99403715133667, "sampling/importance_sampling_ratio/mean": 0.9468588829040527, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.79487133026123, "sampling/sampling_logp_difference/mean": 0.20979991555213928, "step": 167, "step_time": 133.14356829575263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4063.0, "completions/mean_length": 911.078125, "completions/mean_terminated_length": 824.11669921875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7148732990026474, "epoch": 0.41379310344827586, "frac_reward_zero_std": 0.25, "grad_norm": 0.0007135211241037537, "kl": 0.0016217580414377153, "learning_rate": 4.8774913377729994e-05, "loss": -0.01211149524897337, "num_tokens": 25323061.0, "reward": 1.0390625, "reward_std": 0.27174752950668335, "rewards/reward_func/mean": 0.1154513888888889, "rewards/reward_func/std": 0.038944005138344236, "sampling/importance_sampling_ratio/max": 2.9920668601989746, "sampling/importance_sampling_ratio/mean": 0.9543583393096924, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.914791107177734, "sampling/sampling_logp_difference/mean": 0.19774362444877625, "step": 168, "step_time": 131.8124701383058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3150.0, "completions/mean_length": 903.328125, "completions/mean_terminated_length": 858.4031982421875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7233236730098724, "epoch": 0.41625615763546797, "frac_reward_zero_std": 0.25, "grad_norm": 0.008541288810641963, "kl": 0.001499377453001216, "learning_rate": 4.875986885401717e-05, "loss": 0.11034619063138962, "num_tokens": 25464666.0, "reward": 1.234375, "reward_std": 0.8795234560966492, "rewards/reward_func/mean": 0.1371527777777778, "rewards/reward_func/std": 0.1389351338148117, "sampling/importance_sampling_ratio/max": 2.998958110809326, "sampling/importance_sampling_ratio/mean": 0.9543308615684509, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.080719947814941, "sampling/sampling_logp_difference/mean": 0.19305600225925446, "step": 169, "step_time": 124.94637063192204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3017.0, "completions/mean_length": 1323.953125, "completions/mean_terminated_length": 1151.1016845703125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7630322575569153, "epoch": 0.4187192118226601, "frac_reward_zero_std": 0.25, "grad_norm": 0.01083484242329808, "kl": 0.0010297257540514693, "learning_rate": 4.874473486596672e-05, "loss": -0.028323577716946602, "num_tokens": 25652839.0, "reward": 1.140625, "reward_std": 0.6040127277374268, "rewards/reward_func/mean": 0.1267361111111111, "rewards/reward_func/std": 0.1057632068792979, "sampling/importance_sampling_ratio/max": 2.9976863861083984, "sampling/importance_sampling_ratio/mean": 0.9415475726127625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.453205108642578, "sampling/sampling_logp_difference/mean": 0.2272610068321228, "step": 170, "step_time": 133.1720853582956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2148.0, "completions/mean_length": 935.453125, "completions/mean_terminated_length": 885.2857666015625, "completions/min_length": 233.0, "completions/min_terminated_length": 233.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7636792063713074, "epoch": 0.4211822660098522, "frac_reward_zero_std": 0.5, "grad_norm": 0.0028875010309662576, "kl": 0.0013721904251724482, "learning_rate": 4.8729511470563514e-05, "loss": 0.0062779695726931095, "num_tokens": 25809700.0, "reward": 1.09375, "reward_std": 0.33481812477111816, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.045880657931168876, "sampling/importance_sampling_ratio/max": 2.9988179206848145, "sampling/importance_sampling_ratio/mean": 0.9457703828811646, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.243955612182617, "sampling/sampling_logp_difference/mean": 0.22251084446907043, "step": 171, "step_time": 123.20627958187833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3317.0, "completions/max_terminated_length": 3317.0, "completions/mean_length": 950.453125, "completions/mean_terminated_length": 956.4286499023438, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6344201117753983, "epoch": 0.4236453201970443, "frac_reward_zero_std": 0.25, "grad_norm": 0.0010320451728027538, "kl": 0.0010279046691721305, "learning_rate": 4.871419872512901e-05, "loss": 0.010371055454015732, "num_tokens": 25955393.0, "reward": 1.12890625, "reward_std": 0.34787389636039734, "rewards/reward_func/mean": 0.1254340277777778, "rewards/reward_func/std": 0.044849217351939946, "sampling/importance_sampling_ratio/max": 2.9972174167633057, "sampling/importance_sampling_ratio/mean": 0.9531108140945435, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.62364387512207, "sampling/sampling_logp_difference/mean": 0.1808897852897644, "step": 172, "step_time": 121.73163252789527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3917.0, "completions/max_terminated_length": 3917.0, "completions/mean_length": 753.5625, "completions/mean_terminated_length": 761.4603881835938, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7194567173719406, "epoch": 0.42610837438423643, "frac_reward_zero_std": 0.25, "grad_norm": 0.009213834449335897, "kl": 0.0021812406193930656, "learning_rate": 4.869879668732115e-05, "loss": -0.011229299008846283, "num_tokens": 26083653.0, "reward": 0.98828125, "reward_std": 0.3131689429283142, "rewards/reward_func/mean": 0.10980902777777778, "rewards/reward_func/std": 0.04778718948364258, "sampling/importance_sampling_ratio/max": 2.999462127685547, "sampling/importance_sampling_ratio/mean": 0.9515129327774048, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.498061180114746, "sampling/sampling_logp_difference/mean": 0.19364528357982635, "step": 173, "step_time": 141.7643264059443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3441.0, "completions/mean_length": 1269.9375, "completions/mean_terminated_length": 977.586181640625, "completions/min_length": 269.0, "completions/min_terminated_length": 269.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7887077778577805, "epoch": 0.42857142857142855, "frac_reward_zero_std": 0.5, "grad_norm": 0.0010582864632042994, "kl": 0.0011124948505312204, "learning_rate": 4.868330541513405e-05, "loss": -0.012901953421533108, "num_tokens": 26256273.0, "reward": 1.08203125, "reward_std": 0.3592725396156311, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.052181267076068454, "sampling/importance_sampling_ratio/max": 2.9990017414093018, "sampling/importance_sampling_ratio/mean": 0.9508888721466064, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.885161399841309, "sampling/sampling_logp_difference/mean": 0.21054230630397797, "step": 174, "step_time": 198.68682994507253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3267.0, "completions/max_terminated_length": 3267.0, "completions/mean_length": 818.140625, "completions/mean_terminated_length": 818.140625, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6518329381942749, "epoch": 0.43103448275862066, "frac_reward_zero_std": 0.25, "grad_norm": 0.002988795659873789, "kl": 0.00412775349104777, "learning_rate": 4.866772496689787e-05, "loss": 0.003858533687889576, "num_tokens": 26388170.0, "reward": 1.03125, "reward_std": 0.3288387358188629, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.05097940398587121, "sampling/importance_sampling_ratio/max": 2.997185230255127, "sampling/importance_sampling_ratio/mean": 0.9601922035217285, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.480610847473145, "sampling/sampling_logp_difference/mean": 0.1721549779176712, "step": 175, "step_time": 114.63128752307966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3185.0, "completions/mean_length": 1045.515625, "completions/mean_terminated_length": 895.4917602539062, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7153588682413101, "epoch": 0.43349753694581283, "frac_reward_zero_std": 0.5, "grad_norm": 0.004204537629622419, "kl": 0.001135994476499036, "learning_rate": 4.865205540127851e-05, "loss": 0.0316590741276741, "num_tokens": 26534491.0, "reward": 1.21484375, "reward_std": 0.6970276832580566, "rewards/reward_func/mean": 0.1349826388888889, "rewards/reward_func/std": 0.09969028168254429, "sampling/importance_sampling_ratio/max": 2.9958765506744385, "sampling/importance_sampling_ratio/mean": 0.9529693126678467, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.622081756591797, "sampling/sampling_logp_difference/mean": 0.19626040756702423, "step": 176, "step_time": 188.66510831192136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3936.0, "completions/mean_length": 1035.765625, "completions/mean_terminated_length": 937.04833984375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7687556147575378, "epoch": 0.43596059113300495, "frac_reward_zero_std": 0.25, "grad_norm": 0.005146910057299965, "kl": 0.0013427632511593401, "learning_rate": 4.863629677727745e-05, "loss": 0.013107987120747566, "num_tokens": 26678796.0, "reward": 1.09375, "reward_std": 0.5730383992195129, "rewards/reward_func/mean": 0.12152777777777778, "rewards/reward_func/std": 0.09791860481103261, "sampling/importance_sampling_ratio/max": 2.995704412460327, "sampling/importance_sampling_ratio/mean": 0.9536083340644836, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.321060180664062, "sampling/sampling_logp_difference/mean": 0.1994103193283081, "step": 177, "step_time": 122.20711262966506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2110.0, "completions/mean_length": 889.65625, "completions/mean_terminated_length": 786.2294921875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7085517942905426, "epoch": 0.43842364532019706, "frac_reward_zero_std": 0.75, "grad_norm": 0.00044274036412802093, "kl": 0.001734752906486392, "learning_rate": 4.862044915423149e-05, "loss": -0.007765084970742464, "num_tokens": 26822902.0, "reward": 1.140625, "reward_std": 0.39308255910873413, "rewards/reward_func/mean": 0.1267361111111111, "rewards/reward_func/std": 0.054551392793655396, "sampling/importance_sampling_ratio/max": 2.999730348587036, "sampling/importance_sampling_ratio/mean": 0.9586660861968994, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.441228866577148, "sampling/sampling_logp_difference/mean": 0.18852296471595764, "step": 178, "step_time": 135.35360544803552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3859.0, "completions/mean_length": 951.46875, "completions/mean_terminated_length": 844.7212524414062, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6623150259256363, "epoch": 0.4408866995073892, "frac_reward_zero_std": 0.5, "grad_norm": 0.0013933160245554325, "kl": 0.0018012767832260579, "learning_rate": 4.860451259181259e-05, "loss": -0.001757708378136158, "num_tokens": 26970756.0, "reward": 1.0859375, "reward_std": 0.33694103360176086, "rewards/reward_func/mean": 0.12065972222222222, "rewards/reward_func/std": 0.04956694609589047, "sampling/importance_sampling_ratio/max": 2.996303081512451, "sampling/importance_sampling_ratio/mean": 0.9572619795799255, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.23563003540039, "sampling/sampling_logp_difference/mean": 0.1772717833518982, "step": 179, "step_time": 128.08412980427966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1835.0, "completions/mean_length": 792.875, "completions/mean_terminated_length": 685.7166748046875, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "degenerate_groups_filtered": 0.0, "entropy": 0.688085749745369, "epoch": 0.4433497536945813, "frac_reward_zero_std": 0.5, "grad_norm": 0.0007279197620227103, "kl": 0.0020535228250082582, "learning_rate": 4.8588487150027514e-05, "loss": 0.0036929009947925806, "num_tokens": 27099212.0, "reward": 1.0703125, "reward_std": 0.2762732207775116, "rewards/reward_func/mean": 0.1189236111111111, "rewards/reward_func/std": 0.03492574973238839, "sampling/importance_sampling_ratio/max": 2.986546516418457, "sampling/importance_sampling_ratio/mean": 0.9620383977890015, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.313373565673828, "sampling/sampling_logp_difference/mean": 0.17329855263233185, "step": 180, "step_time": 117.17333939834498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3657.0, "completions/max_terminated_length": 3657.0, "completions/mean_length": 810.515625, "completions/mean_terminated_length": 794.3809814453125, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7161764353513718, "epoch": 0.4458128078817734, "frac_reward_zero_std": 0.25, "grad_norm": 0.011804777204104219, "kl": 0.0012608054676093161, "learning_rate": 4.8572372889217776e-05, "loss": 0.04105086624622345, "num_tokens": 27219901.0, "reward": 1.21875, "reward_std": 0.6066758036613464, "rewards/reward_func/mean": 0.13541666666666666, "rewards/reward_func/std": 0.10416450061731869, "sampling/importance_sampling_ratio/max": 2.998037815093994, "sampling/importance_sampling_ratio/mean": 0.9585379362106323, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 18.048479080200195, "sampling/sampling_logp_difference/mean": 0.17843914031982422, "step": 181, "step_time": 104.87256655702367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3713.0, "completions/max_terminated_length": 3713.0, "completions/mean_length": 837.078125, "completions/mean_terminated_length": 837.078125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7794601023197174, "epoch": 0.4482758620689655, "frac_reward_zero_std": 0.0, "grad_norm": 0.0038115502353250493, "kl": 0.0025799469731282443, "learning_rate": 4.855616987005926e-05, "loss": 0.002493778243660927, "num_tokens": 27355138.0, "reward": 1.05859375, "reward_std": 0.40272432565689087, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.059847907887564764, "sampling/importance_sampling_ratio/max": 2.9982709884643555, "sampling/importance_sampling_ratio/mean": 0.9533101320266724, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.990303039550781, "sampling/sampling_logp_difference/mean": 0.1960030198097229, "step": 182, "step_time": 122.55533957784064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3363.0, "completions/mean_length": 765.6875, "completions/mean_terminated_length": 712.825439453125, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6534744799137115, "epoch": 0.45073891625615764, "frac_reward_zero_std": 0.5, "grad_norm": 0.002555384720048418, "kl": 0.00150683059473522, "learning_rate": 4.853987815356211e-05, "loss": 0.015543782152235508, "num_tokens": 27483294.0, "reward": 1.0078125, "reward_std": 0.21347814798355103, "rewards/reward_func/mean": 0.11197916666666667, "rewards/reward_func/std": 0.03231415732039346, "sampling/importance_sampling_ratio/max": 2.996135950088501, "sampling/importance_sampling_ratio/mean": 0.9614719748497009, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.065330505371094, "sampling/sampling_logp_difference/mean": 0.1773977279663086, "step": 183, "step_time": 123.16616046987474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2184.0, "completions/mean_length": 860.078125, "completions/mean_terminated_length": 704.550048828125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6979289501905441, "epoch": 0.45320197044334976, "frac_reward_zero_std": 0.5, "grad_norm": 0.0016726985274926945, "kl": 0.0025047478557098657, "learning_rate": 4.8523497801070394e-05, "loss": -0.020084768533706665, "num_tokens": 27626163.0, "reward": 1.06640625, "reward_std": 0.33994102478027344, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.049871087074279785, "sampling/importance_sampling_ratio/max": 2.9824655055999756, "sampling/importance_sampling_ratio/mean": 0.9549860954284668, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.605437278747559, "sampling/sampling_logp_difference/mean": 0.18550416827201843, "step": 184, "step_time": 176.04412785521708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3536.0, "completions/mean_length": 1211.46875, "completions/mean_terminated_length": 1118.4193115234375, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6525581926107407, "epoch": 0.45566502463054187, "frac_reward_zero_std": 0.25, "grad_norm": 0.01064758291666748, "kl": 0.0012874963285867125, "learning_rate": 4.8507028874261965e-05, "loss": 0.012987809255719185, "num_tokens": 27781713.0, "reward": 1.16796875, "reward_std": 0.5960429906845093, "rewards/reward_func/mean": 0.12977430555555555, "rewards/reward_func/std": 0.10399173531267378, "sampling/importance_sampling_ratio/max": 2.9994893074035645, "sampling/importance_sampling_ratio/mean": 0.9531520009040833, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.717105865478516, "sampling/sampling_logp_difference/mean": 0.18427221477031708, "step": 185, "step_time": 123.03224639990367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3674.0, "completions/mean_length": 973.265625, "completions/mean_terminated_length": 929.7257690429688, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7046706974506378, "epoch": 0.458128078817734, "frac_reward_zero_std": 0.75, "grad_norm": 0.00037532162180780795, "kl": 0.002055374119663611, "learning_rate": 4.8490471435148174e-05, "loss": 0.0002820357622113079, "num_tokens": 27919314.0, "reward": 1.13671875, "reward_std": 0.3419414758682251, "rewards/reward_func/mean": 0.12630208333333334, "rewards/reward_func/std": 0.04240360524919298, "sampling/importance_sampling_ratio/max": 2.9972267150878906, "sampling/importance_sampling_ratio/mean": 0.9548791646957397, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.075159072875977, "sampling/sampling_logp_difference/mean": 0.19040821492671967, "step": 186, "step_time": 131.1666200091131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3434.0, "completions/mean_length": 1098.90625, "completions/mean_terminated_length": 993.4500732421875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6857569068670273, "epoch": 0.4605911330049261, "frac_reward_zero_std": 0.0, "grad_norm": 0.004633843502208573, "kl": 0.003258775017457083, "learning_rate": 4.8473825546073656e-05, "loss": -0.012959948740899563, "num_tokens": 28092956.0, "reward": 0.96875, "reward_std": 0.4284060597419739, "rewards/reward_func/mean": 0.1076388888888889, "rewards/reward_func/std": 0.06393983297877842, "sampling/importance_sampling_ratio/max": 2.9971022605895996, "sampling/importance_sampling_ratio/mean": 0.9417064785957336, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.373357772827148, "sampling/sampling_logp_difference/mean": 0.2150428593158722, "step": 187, "step_time": 147.29947473318316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2978.0, "completions/mean_length": 1049.4375, "completions/mean_terminated_length": 951.1612548828125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6931847035884857, "epoch": 0.4630541871921182, "frac_reward_zero_std": 0.5, "grad_norm": 0.005104650264393571, "kl": 0.001937075809109956, "learning_rate": 4.845709126971609e-05, "loss": 0.052426815032958984, "num_tokens": 28238888.0, "reward": 1.12890625, "reward_std": 0.6681414842605591, "rewards/reward_func/mean": 0.1254340277777778, "rewards/reward_func/std": 0.09152780349055926, "sampling/importance_sampling_ratio/max": 2.999912738800049, "sampling/importance_sampling_ratio/mean": 0.9497792720794678, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.748735427856445, "sampling/sampling_logp_difference/mean": 0.19750040769577026, "step": 188, "step_time": 117.82483472116292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3593.0, "completions/mean_length": 1013.9375, "completions/mean_terminated_length": 863.1000366210938, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7308681607246399, "epoch": 0.46551724137931033, "frac_reward_zero_std": 0.75, "grad_norm": 0.0006965873116807276, "kl": 0.002661357371835038, "learning_rate": 4.844026866908595e-05, "loss": 0.0032661345321685076, "num_tokens": 28395908.0, "reward": 1.07421875, "reward_std": 0.2734251022338867, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.03352663583225674, "sampling/importance_sampling_ratio/max": 2.9960858821868896, "sampling/importance_sampling_ratio/mean": 0.9529147148132324, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.626492500305176, "sampling/sampling_logp_difference/mean": 0.19693899154663086, "step": 189, "step_time": 134.75431111990474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2363.0, "completions/max_terminated_length": 2363.0, "completions/mean_length": 820.734375, "completions/mean_terminated_length": 796.3386840820312, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "degenerate_groups_filtered": 0.0, "entropy": 0.720972329378128, "epoch": 0.46798029556650245, "frac_reward_zero_std": 0.5, "grad_norm": 0.0023207172647921556, "kl": 0.002978406089823693, "learning_rate": 4.8423357807526325e-05, "loss": -0.020184550434350967, "num_tokens": 28532787.0, "reward": 1.04296875, "reward_std": 0.31038472056388855, "rewards/reward_func/mean": 0.11588541666666667, "rewards/reward_func/std": 0.04780791699886322, "sampling/importance_sampling_ratio/max": 2.9988293647766113, "sampling/importance_sampling_ratio/mean": 0.957587480545044, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.802742958068848, "sampling/sampling_logp_difference/mean": 0.18637000024318695, "step": 190, "step_time": 77.44645439600572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4033.0, "completions/mean_length": 1296.359375, "completions/mean_terminated_length": 1109.7166748046875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6860456019639969, "epoch": 0.47044334975369456, "frac_reward_zero_std": 0.25, "grad_norm": 0.0019827677735223988, "kl": 0.0025322052533738315, "learning_rate": 4.840635874871259e-05, "loss": -0.015175355598330498, "num_tokens": 28700730.0, "reward": 1.03125, "reward_std": 0.3818812966346741, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.05688919126987457, "sampling/importance_sampling_ratio/max": 2.9983668327331543, "sampling/importance_sampling_ratio/mean": 0.9488115906715393, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.628395080566406, "sampling/sampling_logp_difference/mean": 0.19605514407157898, "step": 191, "step_time": 153.48830994497985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2290.0, "completions/max_terminated_length": 2290.0, "completions/mean_length": 766.453125, "completions/mean_terminated_length": 766.453125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7697373479604721, "epoch": 0.4729064039408867, "frac_reward_zero_std": 0.75, "grad_norm": 0.0014556993322577337, "kl": 0.002503739349776879, "learning_rate": 4.838927155665225e-05, "loss": -0.0015111538814380765, "num_tokens": 28825287.0, "reward": 1.03125, "reward_std": 0.23779743909835815, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.034599056674374476, "sampling/importance_sampling_ratio/max": 2.9957094192504883, "sampling/importance_sampling_ratio/mean": 0.9514071345329285, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.246959686279297, "sampling/sampling_logp_difference/mean": 0.20502641797065735, "step": 192, "step_time": 74.68089395412244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 1873.0, "completions/mean_length": 547.671875, "completions/mean_terminated_length": 487.34423828125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "degenerate_groups_filtered": 0.0, "entropy": 0.77506323158741, "epoch": 0.4753694581280788, "frac_reward_zero_std": 0.25, "grad_norm": 0.003785555393182919, "kl": 0.006008016353007406, "learning_rate": 4.837209629568462e-05, "loss": 0.001519299577921629, "num_tokens": 28937410.0, "reward": 1.07421875, "reward_std": 0.3637320101261139, "rewards/reward_func/mean": 0.1193576388888889, "rewards/reward_func/std": 0.052706441945499845, "sampling/importance_sampling_ratio/max": 2.9997804164886475, "sampling/importance_sampling_ratio/mean": 0.956741213798523, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.686527252197266, "sampling/sampling_logp_difference/mean": 0.19631287455558777, "step": 193, "step_time": 138.0256498081144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3606.0, "completions/mean_length": 976.28125, "completions/mean_terminated_length": 875.6451416015625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8006569296121597, "epoch": 0.47783251231527096, "frac_reward_zero_std": 0.25, "grad_norm": 0.0010860159991696286, "kl": 0.0022052104177419096, "learning_rate": 4.8354833030480674e-05, "loss": -0.01513269916176796, "num_tokens": 29080052.0, "reward": 1.046875, "reward_std": 0.31140682101249695, "rewards/reward_func/mean": 0.11631944444444445, "rewards/reward_func/std": 0.04530912637710571, "sampling/importance_sampling_ratio/max": 2.997826337814331, "sampling/importance_sampling_ratio/mean": 0.9526568651199341, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.61501693725586, "sampling/sampling_logp_difference/mean": 0.20253780484199524, "step": 194, "step_time": 141.632570264861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1685.0, "completions/mean_length": 889.46875, "completions/mean_terminated_length": 786.0322265625, "completions/min_length": 199.0, "completions/min_terminated_length": 199.0, "degenerate_groups_filtered": 1.0, "entropy": 0.6076531410217285, "epoch": 0.4802955665024631, "frac_reward_zero_std": 0.5, "grad_norm": 0.006373856461449096, "kl": 0.0018063990282826126, "learning_rate": 4.833748182604273e-05, "loss": -0.0016068057157099247, "num_tokens": 29222882.0, "reward": 1.13671875, "reward_std": 0.4691466689109802, "rewards/reward_func/mean": 0.12630208333333334, "rewards/reward_func/std": 0.07778164744377136, "sampling/importance_sampling_ratio/max": 2.9992918968200684, "sampling/importance_sampling_ratio/mean": 0.9589847326278687, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.804981231689453, "sampling/sampling_logp_difference/mean": 0.17736108601093292, "step": 195, "step_time": 119.6795916960109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3320.0, "completions/mean_length": 1108.359375, "completions/mean_terminated_length": 961.4261474609375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7933976054191589, "epoch": 0.4827586206896552, "frac_reward_zero_std": 0.0, "grad_norm": 0.014627021708944242, "kl": 0.002043090295046568, "learning_rate": 4.832004274770422e-05, "loss": -0.04575319588184357, "num_tokens": 29384265.0, "reward": 1.484375, "reward_std": 1.0136713981628418, "rewards/reward_func/mean": 0.16493055555555555, "rewards/reward_func/std": 0.17790989412201774, "sampling/importance_sampling_ratio/max": 2.998352527618408, "sampling/importance_sampling_ratio/mean": 0.9412397742271423, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.949785232543945, "sampling/sampling_logp_difference/mean": 0.22926440834999084, "step": 196, "step_time": 129.6598045709543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2945.0, "completions/max_terminated_length": 2945.0, "completions/mean_length": 1006.734375, "completions/mean_terminated_length": 1006.734375, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "degenerate_groups_filtered": 1.0, "entropy": 0.732940286397934, "epoch": 0.4852216748768473, "frac_reward_zero_std": 0.5, "grad_norm": 0.004308614389485858, "kl": 0.001794125506421551, "learning_rate": 4.8302515861129474e-05, "loss": 0.03310343995690346, "num_tokens": 29536408.0, "reward": 1.33984375, "reward_std": 0.9315535426139832, "rewards/reward_func/mean": 0.1488715277777778, "rewards/reward_func/std": 0.1277098986837599, "sampling/importance_sampling_ratio/max": 2.995858669281006, "sampling/importance_sampling_ratio/mean": 0.9469768404960632, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.225045204162598, "sampling/sampling_logp_difference/mean": 0.20466913282871246, "step": 197, "step_time": 101.47293852618895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3559.0, "completions/mean_length": 1037.3125, "completions/mean_terminated_length": 938.6451416015625, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7546299546957016, "epoch": 0.4876847290640394, "frac_reward_zero_std": 0.5, "grad_norm": 0.0023699039274578254, "kl": 0.0021884184097871184, "learning_rate": 4.828490123231342e-05, "loss": 0.009993281215429306, "num_tokens": 29680908.0, "reward": 1.06640625, "reward_std": 0.33107027411460876, "rewards/reward_func/mean": 0.11848958333333333, "rewards/reward_func/std": 0.047586959269311696, "sampling/importance_sampling_ratio/max": 2.991670846939087, "sampling/importance_sampling_ratio/mean": 0.9526986479759216, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.980742454528809, "sampling/sampling_logp_difference/mean": 0.20294223725795746, "step": 198, "step_time": 187.45196510385722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3956.0, "completions/mean_length": 1104.125, "completions/mean_terminated_length": 904.6666870117188, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7587642967700958, "epoch": 0.49014778325123154, "frac_reward_zero_std": 0.25, "grad_norm": 0.0005524746547252956, "kl": 0.0027338668005540967, "learning_rate": 4.8267198927581415e-05, "loss": -0.0013192156329751015, "num_tokens": 29842468.0, "reward": 1.171875, "reward_std": 0.39559829235076904, "rewards/reward_func/mean": 0.13020833333333334, "rewards/reward_func/std": 0.05048796162009239, "sampling/importance_sampling_ratio/max": 2.996952533721924, "sampling/importance_sampling_ratio/mean": 0.950311541557312, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.986047744750977, "sampling/sampling_logp_difference/mean": 0.20628628134727478, "step": 199, "step_time": 132.54166667349637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4023.0, "completions/mean_length": 965.59375, "completions/mean_terminated_length": 915.90478515625, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "degenerate_groups_filtered": 1.0, "entropy": 0.686070442199707, "epoch": 0.49261083743842365, "frac_reward_zero_std": 0.5, "grad_norm": 0.007126499731189531, "kl": 0.001430476550012827, "learning_rate": 4.824940901358889e-05, "loss": 0.036235012114048004, "num_tokens": 29997386.0, "reward": 1.08203125, "reward_std": 0.663765549659729, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.10387398964828914, "sampling/importance_sampling_ratio/max": 2.998133659362793, "sampling/importance_sampling_ratio/mean": 0.9536129832267761, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.036049842834473, "sampling/sampling_logp_difference/mean": 0.19290143251419067, "step": 200, "step_time": 177.09617541497573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3732.0, "completions/mean_length": 891.75, "completions/mean_terminated_length": 734.1638793945312, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7913601100444794, "epoch": 0.49507389162561577, "frac_reward_zero_std": 0.5, "grad_norm": 0.006751393097530581, "kl": 0.0017078560777008533, "learning_rate": 4.82315315573212e-05, "loss": 0.0010975832119584084, "num_tokens": 30134826.0, "reward": 1.1953125, "reward_std": 0.6921738982200623, "rewards/reward_func/mean": 0.1328125, "rewards/reward_func/std": 0.11135281870762508, "sampling/importance_sampling_ratio/max": 2.9937379360198975, "sampling/importance_sampling_ratio/mean": 0.9510517120361328, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.499581336975098, "sampling/sampling_logp_difference/mean": 0.2039942890405655, "step": 201, "step_time": 125.6223954288289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3002.0, "completions/mean_length": 916.328125, "completions/mean_terminated_length": 864.5806274414062, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8502542078495026, "epoch": 0.4975369458128079, "frac_reward_zero_std": 0.0, "grad_norm": 0.010512409373633538, "kl": 0.0041085208067670465, "learning_rate": 4.8213566626093316e-05, "loss": 0.04938031733036041, "num_tokens": 30283039.0, "reward": 1.19140625, "reward_std": 0.872578501701355, "rewards/reward_func/mean": 0.1323784722222222, "rewards/reward_func/std": 0.1458116587665346, "sampling/importance_sampling_ratio/max": 2.9969401359558105, "sampling/importance_sampling_ratio/mean": 0.9425897598266602, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.473159790039062, "sampling/sampling_logp_difference/mean": 0.22600840032100677, "step": 202, "step_time": 131.78623059717938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3201.0, "completions/mean_length": 1062.703125, "completions/mean_terminated_length": 978.88134765625, "completions/min_length": 1.0, "completions/min_terminated_length": 157.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6913492828607559, "epoch": 0.5, "frac_reward_zero_std": 0.5, "grad_norm": 0.0015925848185912536, "kl": 0.001410837227012962, "learning_rate": 4.819551428754957e-05, "loss": -0.047750379890203476, "num_tokens": 30434060.0, "reward": 1.03125, "reward_std": 0.34646743535995483, "rewards/reward_func/mean": 0.11458333333333333, "rewards/reward_func/std": 0.0521190000904931, "sampling/importance_sampling_ratio/max": 2.9992289543151855, "sampling/importance_sampling_ratio/mean": 0.9494022727012634, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.937480926513672, "sampling/sampling_logp_difference/mean": 0.19053122401237488, "step": 203, "step_time": 122.92247278685682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3885.0, "completions/mean_length": 1074.046875, "completions/mean_terminated_length": 976.5645141601562, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "degenerate_groups_filtered": 1.0, "entropy": 0.654021367430687, "epoch": 0.5024630541871922, "frac_reward_zero_std": 0.5, "grad_norm": 0.0051969291675550665, "kl": 0.002041646250290796, "learning_rate": 4.8177374609663415e-05, "loss": 0.015631053596735, "num_tokens": 30597503.0, "reward": 1.1875, "reward_std": 0.634647786617279, "rewards/reward_func/mean": 0.13194444444444445, "rewards/reward_func/std": 0.11371641523308224, "sampling/importance_sampling_ratio/max": 2.996506690979004, "sampling/importance_sampling_ratio/mean": 0.9503429532051086, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.598240852355957, "sampling/sampling_logp_difference/mean": 0.1922573447227478, "step": 204, "step_time": 163.13331845588982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3323.0, "completions/mean_length": 1194.078125, "completions/mean_terminated_length": 1100.4676513671875, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6860644668340683, "epoch": 0.5049261083743842, "frac_reward_zero_std": 0.75, "grad_norm": 0.0017076158671597208, "kl": 0.004838668974116445, "learning_rate": 4.815914766073719e-05, "loss": 0.00643888721242547, "num_tokens": 30764340.0, "reward": 1.18359375, "reward_std": 0.3965180516242981, "rewards/reward_func/mean": 0.13151041666666666, "rewards/reward_func/std": 0.047183099720213145, "sampling/importance_sampling_ratio/max": 2.9985768795013428, "sampling/importance_sampling_ratio/mean": 0.9507815837860107, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.72639274597168, "sampling/sampling_logp_difference/mean": 0.19618508219718933, "step": 205, "step_time": 133.87398360297084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3245.0, "completions/mean_length": 952.671875, "completions/mean_terminated_length": 855.6720581054688, "completions/min_length": 65.0, "completions/min_terminated_length": 65.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7687894254922867, "epoch": 0.5073891625615764, "frac_reward_zero_std": 0.75, "grad_norm": 0.002176552438569466, "kl": 0.0018984676571562886, "learning_rate": 4.8140833509401815e-05, "loss": 0.006664093118160963, "num_tokens": 30918895.0, "reward": 1.08203125, "reward_std": 0.3119787275791168, "rewards/reward_func/mean": 0.12022569444444445, "rewards/reward_func/std": 0.043059426049391426, "sampling/importance_sampling_ratio/max": 2.9998011589050293, "sampling/importance_sampling_ratio/mean": 0.9475848078727722, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.37496280670166, "sampling/sampling_logp_difference/mean": 0.22000627219676971, "step": 206, "step_time": 122.86152748181485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3039.0, "completions/mean_length": 886.515625, "completions/mean_terminated_length": 828.9677124023438, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6943669766187668, "epoch": 0.5098522167487685, "frac_reward_zero_std": 0.5, "grad_norm": 0.004818923673628531, "kl": 0.0026133455976378173, "learning_rate": 4.812243222461658e-05, "loss": 0.020810682326555252, "num_tokens": 31068336.0, "reward": 1.41796875, "reward_std": 0.7546873092651367, "rewards/reward_func/mean": 0.15755208333333334, "rewards/reward_func/std": 0.1127622624238332, "sampling/importance_sampling_ratio/max": 2.9922666549682617, "sampling/importance_sampling_ratio/mean": 0.9491963386535645, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.743640899658203, "sampling/sampling_logp_difference/mean": 0.19928023219108582, "step": 207, "step_time": 126.38416155404411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3327.0, "completions/max_terminated_length": 3327.0, "completions/mean_length": 980.890625, "completions/mean_terminated_length": 975.0806274414062, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7900367081165314, "epoch": 0.5123152709359606, "frac_reward_zero_std": 0.5, "grad_norm": 0.007988081424164765, "kl": 0.0018078716529998928, "learning_rate": 4.8103943875668844e-05, "loss": 0.017407327890396118, "num_tokens": 31224729.0, "reward": 1.24609375, "reward_std": 0.613169252872467, "rewards/reward_func/mean": 0.1384548611111111, "rewards/reward_func/std": 0.1027386552757687, "sampling/importance_sampling_ratio/max": 2.998058795928955, "sampling/importance_sampling_ratio/mean": 0.9472312927246094, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.543306350708008, "sampling/sampling_logp_difference/mean": 0.21559563279151917, "step": 208, "step_time": 107.44903364474885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3869.0, "completions/mean_length": 965.921875, "completions/mean_terminated_length": 914.9515991210938, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7459463179111481, "epoch": 0.5147783251231527, "frac_reward_zero_std": 0.25, "grad_norm": 0.010340537458138942, "kl": 0.0023739843745715916, "learning_rate": 4.8085368532173804e-05, "loss": -0.028774894773960114, "num_tokens": 31359492.0, "reward": 1.08984375, "reward_std": 0.5998796224594116, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.10784303976429833, "sampling/importance_sampling_ratio/max": 2.9987199306488037, "sampling/importance_sampling_ratio/mean": 0.9521645903587341, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.040218353271484, "sampling/sampling_logp_difference/mean": 0.20044687390327454, "step": 209, "step_time": 184.2058972257655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3810.0, "completions/mean_length": 1076.546875, "completions/mean_terminated_length": 1007.9000244140625, "completions/min_length": 1.0, "completions/min_terminated_length": 75.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7519822269678116, "epoch": 0.5172413793103449, "frac_reward_zero_std": 0.0, "grad_norm": 0.008282575286698903, "kl": 0.0027343417750671506, "learning_rate": 4.806670626407422e-05, "loss": -0.029934097081422806, "num_tokens": 31517863.0, "reward": 1.08984375, "reward_std": 0.6612386107444763, "rewards/reward_func/mean": 0.12109375, "rewards/reward_func/std": 0.13787324395444658, "sampling/importance_sampling_ratio/max": 2.9987173080444336, "sampling/importance_sampling_ratio/mean": 0.9486758708953857, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.930124282836914, "sampling/sampling_logp_difference/mean": 0.20651331543922424, "step": 210, "step_time": 130.95431489613838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3584.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 991.34375, "completions/mean_terminated_length": 991.34375, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7369837611913681, "epoch": 0.5197044334975369, "frac_reward_zero_std": 0.5, "grad_norm": 0.005166807938091367, "kl": 0.003004661004524678, "learning_rate": 4.804795714164015e-05, "loss": 0.04796065762639046, "num_tokens": 31662573.0, "reward": 1.0859375, "reward_std": 0.7046032547950745, "rewards/reward_func/mean": 0.12065972222222222, "rewards/reward_func/std": 0.10800171229574415, "sampling/importance_sampling_ratio/max": 2.9918057918548584, "sampling/importance_sampling_ratio/mean": 0.9489620327949524, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.318878173828125, "sampling/sampling_logp_difference/mean": 0.19743148982524872, "step": 211, "step_time": 103.4146853510756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 1794.0, "completions/mean_length": 828.921875, "completions/mean_terminated_length": 769.5645141601562, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7094843238592148, "epoch": 0.5221674876847291, "frac_reward_zero_std": 0.25, "grad_norm": 0.010080593679367157, "kl": 0.0021559473825618625, "learning_rate": 4.8029121235468696e-05, "loss": 0.07708187401294708, "num_tokens": 31799144.0, "reward": 1.32421875, "reward_std": 0.9622638821601868, "rewards/reward_func/mean": 0.14713541666666666, "rewards/reward_func/std": 0.13708895444869995, "sampling/importance_sampling_ratio/max": 2.9905505180358887, "sampling/importance_sampling_ratio/mean": 0.9552954435348511, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.449729919433594, "sampling/sampling_logp_difference/mean": 0.1984173059463501, "step": 212, "step_time": 126.7395717408508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3203.0, "completions/mean_length": 1188.765625, "completions/mean_terminated_length": 1112.57373046875, "completions/min_length": 22.0, "completions/min_terminated_length": 156.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8555341958999634, "epoch": 0.5246305418719212, "frac_reward_zero_std": 0.0, "grad_norm": 0.007695774946068362, "kl": 0.00277452525915578, "learning_rate": 4.8010198616483736e-05, "loss": 0.0049737924709916115, "num_tokens": 31960265.0, "reward": 1.05859375, "reward_std": 0.7067449688911438, "rewards/reward_func/mean": 0.11762152777777778, "rewards/reward_func/std": 0.11894809040758345, "sampling/importance_sampling_ratio/max": 2.9988372325897217, "sampling/importance_sampling_ratio/mean": 0.9447873830795288, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.881756782531738, "sampling/sampling_logp_difference/mean": 0.22210858762264252, "step": 213, "step_time": 136.29223776236176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3721.0, "completions/mean_length": 1243.328125, "completions/mean_terminated_length": 1053.1500244140625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7398181259632111, "epoch": 0.5270935960591133, "frac_reward_zero_std": 0.5, "grad_norm": 0.006997510070903853, "kl": 0.0023148250475060195, "learning_rate": 4.799118935593563e-05, "loss": 0.01846671663224697, "num_tokens": 32123518.0, "reward": 1.13671875, "reward_std": 0.6885251402854919, "rewards/reward_func/mean": 0.12630208333333334, "rewards/reward_func/std": 0.1024610847234726, "sampling/importance_sampling_ratio/max": 2.9988057613372803, "sampling/importance_sampling_ratio/mean": 0.9522387385368347, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.218445777893066, "sampling/sampling_logp_difference/mean": 0.20401646196842194, "step": 214, "step_time": 133.83199871121906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3474.0, "completions/mean_length": 984.734375, "completions/mean_terminated_length": 884.3709716796875, "completions/min_length": 70.0, "completions/min_terminated_length": 70.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7282632738351822, "epoch": 0.5295566502463054, "frac_reward_zero_std": 0.25, "grad_norm": 0.006373666257538889, "kl": 0.0025452679255977273, "learning_rate": 4.797209352540101e-05, "loss": 0.06603739410638809, "num_tokens": 32279373.0, "reward": 1.22265625, "reward_std": 0.9429323673248291, "rewards/reward_func/mean": 0.13585069444444445, "rewards/reward_func/std": 0.14530256390571594, "sampling/importance_sampling_ratio/max": 2.9987716674804688, "sampling/importance_sampling_ratio/mean": 0.9473279118537903, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.93663501739502, "sampling/sampling_logp_difference/mean": 0.20334208011627197, "step": 215, "step_time": 171.04516539885662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3199.0, "completions/mean_length": 960.265625, "completions/mean_terminated_length": 872.3933715820312, "completions/min_length": 191.0, "completions/min_terminated_length": 191.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6828020513057709, "epoch": 0.5320197044334976, "frac_reward_zero_std": 0.0, "grad_norm": 0.00906798233704518, "kl": 0.002584115689387545, "learning_rate": 4.7952911196782426e-05, "loss": 0.053639333695173264, "num_tokens": 32419022.0, "reward": 1.2265625, "reward_std": 0.8860904574394226, "rewards/reward_func/mean": 0.1362847222222222, "rewards/reward_func/std": 0.14273416499296823, "sampling/importance_sampling_ratio/max": 2.9965453147888184, "sampling/importance_sampling_ratio/mean": 0.9599682092666626, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.712519645690918, "sampling/sampling_logp_difference/mean": 0.17587494850158691, "step": 216, "step_time": 117.92797983251512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 3881.0, "completions/max_terminated_length": 3881.0, "completions/mean_length": 994.046875, "completions/mean_terminated_length": 988.5806274414062, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8513800352811813, "epoch": 0.5344827586206896, "frac_reward_zero_std": 0.0, "grad_norm": 0.017120938841582653, "kl": 0.004447525599971414, "learning_rate": 4.793364244230818e-05, "loss": 0.09198958426713943, "num_tokens": 32584177.0, "reward": 1.375, "reward_std": 1.273976445198059, "rewards/reward_func/mean": 0.1527777777777778, "rewards/reward_func/std": 0.18192580342292786, "sampling/importance_sampling_ratio/max": 2.997528314590454, "sampling/importance_sampling_ratio/mean": 0.9468756914138794, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.249998092651367, "sampling/sampling_logp_difference/mean": 0.2235032320022583, "step": 217, "step_time": 127.43058389122598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3963.0, "completions/max_terminated_length": 3963.0, "completions/mean_length": 814.8125, "completions/mean_terminated_length": 818.3333740234375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7400757819414139, "epoch": 0.5369458128078818, "frac_reward_zero_std": 0.0, "grad_norm": 0.02154058014893653, "kl": 0.004508270707447082, "learning_rate": 4.791428733453195e-05, "loss": -0.06291753053665161, "num_tokens": 32717189.0, "reward": 1.4296875, "reward_std": 1.1822670698165894, "rewards/reward_func/mean": 0.15885416666666666, "rewards/reward_func/std": 0.1904856049352222, "sampling/importance_sampling_ratio/max": 2.9951188564300537, "sampling/importance_sampling_ratio/mean": 0.9537706971168518, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.874495506286621, "sampling/sampling_logp_difference/mean": 0.18929260969161987, "step": 218, "step_time": 130.0812120535411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2717.0, "completions/mean_length": 1357.421875, "completions/mean_terminated_length": 1252.7540283203125, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7078727632761002, "epoch": 0.5394088669950738, "frac_reward_zero_std": 0.0, "grad_norm": 0.015914954426622312, "kl": 0.003292074310593307, "learning_rate": 4.78948459463326e-05, "loss": 0.054672300815582275, "num_tokens": 32895600.0, "reward": 1.6640625, "reward_std": 1.4741278886795044, "rewards/reward_func/mean": 0.18489583333333334, "rewards/reward_func/std": 0.2328646464480294, "sampling/importance_sampling_ratio/max": 2.997037410736084, "sampling/importance_sampling_ratio/mean": 0.943734347820282, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.8037109375, "sampling/sampling_logp_difference/mean": 0.20783142745494843, "step": 219, "step_time": 193.00463135214522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2831.0, "completions/mean_length": 1199.609375, "completions/mean_terminated_length": 1019.6101684570312, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7149243652820587, "epoch": 0.541871921182266, "frac_reward_zero_std": 0.0, "grad_norm": 0.014175892165371774, "kl": 0.005893846391700208, "learning_rate": 4.7875318350913846e-05, "loss": -0.022984549403190613, "num_tokens": 33065783.0, "reward": 1.546875, "reward_std": 1.3008506298065186, "rewards/reward_func/mean": 0.171875, "rewards/reward_func/std": 0.198716941393084, "sampling/importance_sampling_ratio/max": 2.997241735458374, "sampling/importance_sampling_ratio/mean": 0.9502947330474854, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.1736421585083, "sampling/sampling_logp_difference/mean": 0.19458694756031036, "step": 220, "step_time": 184.47228501015343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2686.0, "completions/mean_length": 1140.203125, "completions/mean_terminated_length": 994.8359985351562, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "degenerate_groups_filtered": 0.0, "entropy": 0.708164781332016, "epoch": 0.5443349753694581, "frac_reward_zero_std": 0.5, "grad_norm": 0.01514246486059271, "kl": 0.005792527925223112, "learning_rate": 4.785570462180402e-05, "loss": 0.04811932519078255, "num_tokens": 33233828.0, "reward": 1.4375, "reward_std": 1.2328184843063354, "rewards/reward_func/mean": 0.1597222222222222, "rewards/reward_func/std": 0.1681312604082955, "sampling/importance_sampling_ratio/max": 2.9987242221832275, "sampling/importance_sampling_ratio/mean": 0.9442251920700073, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.4033203125, "sampling/sampling_logp_difference/mean": 0.21833211183547974, "step": 221, "step_time": 153.67408644291572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3886.0, "completions/mean_length": 1162.671875, "completions/mean_terminated_length": 1068.04833984375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7060115039348602, "epoch": 0.5467980295566502, "frac_reward_zero_std": 0.0, "grad_norm": 0.018054004424587895, "kl": 0.00600872898939997, "learning_rate": 4.7836004832855776e-05, "loss": 0.15034234523773193, "num_tokens": 33395023.0, "reward": 1.703125, "reward_std": 1.6084320545196533, "rewards/reward_func/mean": 0.1892361111111111, "rewards/reward_func/std": 0.22175164024035135, "sampling/importance_sampling_ratio/max": 2.999399185180664, "sampling/importance_sampling_ratio/mean": 0.9499921798706055, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.749910354614258, "sampling/sampling_logp_difference/mean": 0.19778406620025635, "step": 222, "step_time": 133.30245491000824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3866.0, "completions/mean_length": 1109.5, "completions/mean_terminated_length": 949.2542114257812, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7275642454624176, "epoch": 0.5492610837438424, "frac_reward_zero_std": 0.0, "grad_norm": 0.021749090980709224, "kl": 0.012067305855453014, "learning_rate": 4.781621905824579e-05, "loss": 0.06272266805171967, "num_tokens": 33556591.0, "reward": 1.71875, "reward_std": 1.5042927265167236, "rewards/reward_func/mean": 0.1909722222222222, "rewards/reward_func/std": 0.22010938243733513, "sampling/importance_sampling_ratio/max": 2.992997169494629, "sampling/importance_sampling_ratio/mean": 0.9458306431770325, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.075740814208984, "sampling/sampling_logp_difference/mean": 0.20269128680229187, "step": 223, "step_time": 133.02171413018368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3887.0, "completions/mean_length": 1109.21875, "completions/mean_terminated_length": 972.7368774414062, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7367782890796661, "epoch": 0.5517241379310345, "frac_reward_zero_std": 0.0, "grad_norm": 0.02082368102789195, "kl": 0.01568816788494587, "learning_rate": 4.779634737247455e-05, "loss": 0.18037329614162445, "num_tokens": 33709229.0, "reward": 1.96484375, "reward_std": 1.7749732732772827, "rewards/reward_func/mean": 0.2183159722222222, "rewards/reward_func/std": 0.26773934563000995, "sampling/importance_sampling_ratio/max": 2.999913454055786, "sampling/importance_sampling_ratio/mean": 0.9544388651847839, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.428032875061035, "sampling/sampling_logp_difference/mean": 0.18823330104351044, "step": 224, "step_time": 120.03369585401379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2172.0, "completions/mean_length": 880.921875, "completions/mean_terminated_length": 819.0322265625, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6620640158653259, "epoch": 0.5541871921182266, "frac_reward_zero_std": 0.0, "grad_norm": 0.0348405270348598, "kl": 0.021940368227660656, "learning_rate": 4.777638985036599e-05, "loss": 0.03423825651407242, "num_tokens": 33854472.0, "reward": 2.578125, "reward_std": 1.9969595670700073, "rewards/reward_func/mean": 0.2864583333333333, "rewards/reward_func/std": 0.2823880405889617, "sampling/importance_sampling_ratio/max": 2.9987571239471436, "sampling/importance_sampling_ratio/mean": 0.9559164643287659, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.310111045837402, "sampling/sampling_logp_difference/mean": 0.17907507717609406, "step": 225, "step_time": 127.27977883489802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3468.0, "completions/mean_length": 1223.015625, "completions/mean_terminated_length": 1161.61669921875, "completions/min_length": 16.0, "completions/min_terminated_length": 300.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6341233402490616, "epoch": 0.5566502463054187, "frac_reward_zero_std": 0.0, "grad_norm": 0.030284450173885293, "kl": 0.0230104043148458, "learning_rate": 4.7756346567067255e-05, "loss": -0.21584941446781158, "num_tokens": 34022121.0, "reward": 3.34375, "reward_std": 2.269754648208618, "rewards/reward_func/mean": 0.3715277777777778, "rewards/reward_func/std": 0.3541766901810964, "sampling/importance_sampling_ratio/max": 2.9997711181640625, "sampling/importance_sampling_ratio/mean": 0.9506696462631226, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.499972343444824, "sampling/sampling_logp_difference/mean": 0.18650886416435242, "step": 226, "step_time": 124.42867265990935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3246.0, "completions/mean_length": 1110.9375, "completions/mean_terminated_length": 1049.2950439453125, "completions/min_length": 371.0, "completions/min_terminated_length": 371.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5698187798261642, "epoch": 0.5591133004926109, "frac_reward_zero_std": 0.0, "grad_norm": 0.03229629907628574, "kl": 0.015496046748012304, "learning_rate": 4.773621759804844e-05, "loss": -0.08266813308000565, "num_tokens": 34176565.0, "reward": 2.4140625, "reward_std": 1.8820469379425049, "rewards/reward_func/mean": 0.2682291666666667, "rewards/reward_func/std": 0.25015421791209114, "sampling/importance_sampling_ratio/max": 2.9965968132019043, "sampling/importance_sampling_ratio/mean": 0.9610643982887268, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.3748140335083, "sampling/sampling_logp_difference/mean": 0.16262675821781158, "step": 227, "step_time": 112.16787964990363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4086.0, "completions/max_terminated_length": 4086.0, "completions/mean_length": 1114.328125, "completions/mean_terminated_length": 1115.245849609375, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6151544153690338, "epoch": 0.5615763546798029, "frac_reward_zero_std": 0.0, "grad_norm": 0.041034398035143124, "kl": 0.031581724528223276, "learning_rate": 4.771600301910224e-05, "loss": 0.06149850785732269, "num_tokens": 34339738.0, "reward": 3.2578125, "reward_std": 2.1524112224578857, "rewards/reward_func/mean": 0.3619791666666667, "rewards/reward_func/std": 0.3190525414215194, "sampling/importance_sampling_ratio/max": 2.9979159832000732, "sampling/importance_sampling_ratio/mean": 0.9513490796089172, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.436819076538086, "sampling/sampling_logp_difference/mean": 0.18058064579963684, "step": 228, "step_time": 127.38730531884357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3669.0, "completions/mean_length": 1181.671875, "completions/mean_terminated_length": 965.132080078125, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6493166834115982, "epoch": 0.5640394088669951, "frac_reward_zero_std": 0.0, "grad_norm": 0.02913287172531979, "kl": 0.02364873979240656, "learning_rate": 4.769570290634373e-05, "loss": -0.0989631861448288, "num_tokens": 34506581.0, "reward": 3.60546875, "reward_std": 2.1647448539733887, "rewards/reward_func/mean": 0.4006076388888889, "rewards/reward_func/std": 0.3339156011740367, "sampling/importance_sampling_ratio/max": 2.9985804557800293, "sampling/importance_sampling_ratio/mean": 0.9563257694244385, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.810356140136719, "sampling/sampling_logp_difference/mean": 0.17818036675453186, "step": 229, "step_time": 133.75939935678616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2632.0, "completions/mean_length": 767.6875, "completions/mean_terminated_length": 718.5, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5988651365041733, "epoch": 0.5665024630541872, "frac_reward_zero_std": 0.0, "grad_norm": 0.05832143190503159, "kl": 0.0540700601413846, "learning_rate": 4.767531733621004e-05, "loss": -0.026409372687339783, "num_tokens": 34635841.0, "reward": 4.26171875, "reward_std": 1.9197334051132202, "rewards/reward_func/mean": 0.4735243055555556, "rewards/reward_func/std": 0.30134472085369957, "sampling/importance_sampling_ratio/max": 2.9955201148986816, "sampling/importance_sampling_ratio/mean": 0.9635403752326965, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.575170516967773, "sampling/sampling_logp_difference/mean": 0.16176164150238037, "step": 230, "step_time": 118.32405733293854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2725.0, "completions/mean_length": 1067.234375, "completions/mean_terminated_length": 954.6610107421875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5734339356422424, "epoch": 0.5689655172413793, "frac_reward_zero_std": 0.0, "grad_norm": 0.03233225495323699, "kl": 0.024783702800050378, "learning_rate": 4.765484638546005e-05, "loss": 0.08640626072883606, "num_tokens": 34781952.0, "reward": 3.7734375, "reward_std": 2.134708881378174, "rewards/reward_func/mean": 0.4192708333333333, "rewards/reward_func/std": 0.32498767226934433, "sampling/importance_sampling_ratio/max": 2.9994750022888184, "sampling/importance_sampling_ratio/mean": 0.9611262679100037, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.97213363647461, "sampling/sampling_logp_difference/mean": 0.16367268562316895, "step": 231, "step_time": 114.58008486474864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3762.0, "completions/mean_length": 1316.703125, "completions/mean_terminated_length": 995.4509887695312, "completions/min_length": 1.0, "completions/min_terminated_length": 277.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6045290231704712, "epoch": 0.5714285714285714, "frac_reward_zero_std": 0.0, "grad_norm": 0.025361740347373922, "kl": 0.025120028294622898, "learning_rate": 4.7634290131174184e-05, "loss": -0.23532718420028687, "num_tokens": 34954989.0, "reward": 3.52734375, "reward_std": 2.2471840381622314, "rewards/reward_func/mean": 0.3919270833333333, "rewards/reward_func/std": 0.3482829729715983, "sampling/importance_sampling_ratio/max": 2.9946084022521973, "sampling/importance_sampling_ratio/mean": 0.9581983685493469, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.114594459533691, "sampling/sampling_logp_difference/mean": 0.1707635223865509, "step": 232, "step_time": 186.30969157605432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3270.0, "completions/mean_length": 1146.046875, "completions/mean_terminated_length": 1056.482177734375, "completions/min_length": 47.0, "completions/min_terminated_length": 47.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6580791026353836, "epoch": 0.5738916256157636, "frac_reward_zero_std": 0.0, "grad_norm": 0.02936261451483538, "kl": 0.01962541602551937, "learning_rate": 4.761364865075402e-05, "loss": -0.09561189264059067, "num_tokens": 35105408.0, "reward": 3.94140625, "reward_std": 2.1033525466918945, "rewards/reward_func/mean": 0.4379340277777778, "rewards/reward_func/std": 0.3491085684961743, "sampling/importance_sampling_ratio/max": 2.9998505115509033, "sampling/importance_sampling_ratio/mean": 0.9548863768577576, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.624435424804688, "sampling/sampling_logp_difference/mean": 0.1729675531387329, "step": 233, "step_time": 127.6355293749366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2387.0, "completions/mean_length": 1181.0, "completions/mean_terminated_length": 965.26318359375, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6062022000551224, "epoch": 0.5763546798029556, "frac_reward_zero_std": 0.0, "grad_norm": 0.030717672720896462, "kl": 0.03280913829803467, "learning_rate": 4.7592922021922056e-05, "loss": -0.04321448132395744, "num_tokens": 35276032.0, "reward": 3.12109375, "reward_std": 2.1648309230804443, "rewards/reward_func/mean": 0.3467881944444444, "rewards/reward_func/std": 0.306332517001364, "sampling/importance_sampling_ratio/max": 2.9995665550231934, "sampling/importance_sampling_ratio/mean": 0.9498552083969116, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.928382873535156, "sampling/sampling_logp_difference/mean": 0.18584509193897247, "step": 234, "step_time": 145.97126659261994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2905.0, "completions/mean_length": 1276.90625, "completions/mean_terminated_length": 1030.436279296875, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6295378506183624, "epoch": 0.5788177339901478, "frac_reward_zero_std": 0.0, "grad_norm": 0.025004651439172708, "kl": 0.036536975763738155, "learning_rate": 4.757211032272141e-05, "loss": 0.016434896737337112, "num_tokens": 35441466.0, "reward": 3.7109375, "reward_std": 2.195873975753784, "rewards/reward_func/mean": 0.4123263888888889, "rewards/reward_func/std": 0.33889370991124046, "sampling/importance_sampling_ratio/max": 2.9977970123291016, "sampling/importance_sampling_ratio/mean": 0.9575643539428711, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.089038848876953, "sampling/sampling_logp_difference/mean": 0.166203573346138, "step": 235, "step_time": 124.31631009606645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2577.0, "completions/mean_length": 1064.4375, "completions/mean_terminated_length": 977.6392822265625, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5670924335718155, "epoch": 0.5812807881773399, "frac_reward_zero_std": 0.0, "grad_norm": 0.03182262502657222, "kl": 0.03349051624536514, "learning_rate": 4.75512136315155e-05, "loss": -0.07744449377059937, "num_tokens": 35599350.0, "reward": 4.21484375, "reward_std": 1.93026602268219, "rewards/reward_func/mean": 0.4683159722222222, "rewards/reward_func/std": 0.33124545713265735, "sampling/importance_sampling_ratio/max": 2.99531888961792, "sampling/importance_sampling_ratio/mean": 0.9617635011672974, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.312065124511719, "sampling/sampling_logp_difference/mean": 0.161124125123024, "step": 236, "step_time": 161.99643060285598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3437.0, "completions/mean_length": 1482.609375, "completions/mean_terminated_length": 1240.6207275390625, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5801850706338882, "epoch": 0.583743842364532, "frac_reward_zero_std": 0.0, "grad_norm": 0.025051146875974366, "kl": 0.021959123201668262, "learning_rate": 4.7530232026987807e-05, "loss": 0.03501718491315842, "num_tokens": 35774285.0, "reward": 4.34765625, "reward_std": 1.9968314170837402, "rewards/reward_func/mean": 0.4830729166666667, "rewards/reward_func/std": 0.3292863344152768, "sampling/importance_sampling_ratio/max": 2.999497890472412, "sampling/importance_sampling_ratio/mean": 0.9603710174560547, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.829028129577637, "sampling/sampling_logp_difference/mean": 0.1585114598274231, "step": 237, "step_time": 135.68384832888842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3597.0, "completions/mean_length": 1484.640625, "completions/mean_terminated_length": 1111.58935546875, "completions/min_length": 211.0, "completions/min_terminated_length": 211.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6366446763277054, "epoch": 0.5862068965517241, "frac_reward_zero_std": 0.0, "grad_norm": 0.01741330264462691, "kl": 0.022137976717203856, "learning_rate": 4.75091655881415e-05, "loss": 0.06264235824346542, "num_tokens": 35944198.0, "reward": 4.78515625, "reward_std": 1.6170392036437988, "rewards/reward_func/mean": 0.5316840277777778, "rewards/reward_func/std": 0.260915905651119, "sampling/importance_sampling_ratio/max": 2.9989659786224365, "sampling/importance_sampling_ratio/mean": 0.9526357650756836, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.190343856811523, "sampling/sampling_logp_difference/mean": 0.18196536600589752, "step": 238, "step_time": 166.6468972011935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2563.0, "completions/mean_length": 1084.28125, "completions/mean_terminated_length": 889.0167236328125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6271532773971558, "epoch": 0.5886699507389163, "frac_reward_zero_std": 0.0, "grad_norm": 0.04962193586561231, "kl": 0.10968296322971582, "learning_rate": 4.7488014394299205e-05, "loss": -0.17306064069271088, "num_tokens": 36116088.0, "reward": 4.125, "reward_std": 1.8126540184020996, "rewards/reward_func/mean": 0.4583333333333333, "rewards/reward_func/std": 0.2737976892126931, "sampling/importance_sampling_ratio/max": 2.998816967010498, "sampling/importance_sampling_ratio/mean": 0.9479783177375793, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 20.247161865234375, "sampling/sampling_logp_difference/mean": 0.19447281956672668, "step": 239, "step_time": 173.26991621381603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3430.0, "completions/mean_length": 1172.03125, "completions/mean_terminated_length": 1077.7095947265625, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5948723554611206, "epoch": 0.5911330049261084, "frac_reward_zero_std": 0.0, "grad_norm": 0.026709746483499667, "kl": 0.03804912185296416, "learning_rate": 4.746677852510267e-05, "loss": -0.038154736161231995, "num_tokens": 36276522.0, "reward": 4.55859375, "reward_std": 1.704768180847168, "rewards/reward_func/mean": 0.5065104166666666, "rewards/reward_func/std": 0.268581575817532, "sampling/importance_sampling_ratio/max": 2.9988701343536377, "sampling/importance_sampling_ratio/mean": 0.9554129242897034, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.894377708435059, "sampling/sampling_logp_difference/mean": 0.17530812323093414, "step": 240, "step_time": 152.09504526108503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3526.0, "completions/max_terminated_length": 3526.0, "completions/mean_length": 1202.0, "completions/mean_terminated_length": 1201.6826171875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6431476175785065, "epoch": 0.5935960591133005, "frac_reward_zero_std": 0.0, "grad_norm": 0.025089827367739004, "kl": 0.018822600599378347, "learning_rate": 4.7445458060512484e-05, "loss": -0.08312501013278961, "num_tokens": 36449114.0, "reward": 4.50390625, "reward_std": 1.6970399618148804, "rewards/reward_func/mean": 0.5004340277777778, "rewards/reward_func/std": 0.3032427848213249, "sampling/importance_sampling_ratio/max": 2.9981064796447754, "sampling/importance_sampling_ratio/mean": 0.9516088366508484, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.683661460876465, "sampling/sampling_logp_difference/mean": 0.18970529735088348, "step": 241, "step_time": 157.17970843007788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2980.0, "completions/max_terminated_length": 2980.0, "completions/mean_length": 851.984375, "completions/mean_terminated_length": 851.984375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5084992349147797, "epoch": 0.5960591133004927, "frac_reward_zero_std": 0.0, "grad_norm": 0.02559628508870623, "kl": 0.019516848493367434, "learning_rate": 4.742405308080775e-05, "loss": 0.002505837008357048, "num_tokens": 36587897.0, "reward": 4.5390625, "reward_std": 1.5686782598495483, "rewards/reward_func/mean": 0.5043402777777778, "rewards/reward_func/std": 0.24406109833055073, "sampling/importance_sampling_ratio/max": 2.998159408569336, "sampling/importance_sampling_ratio/mean": 0.9689394235610962, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.643729209899902, "sampling/sampling_logp_difference/mean": 0.1409374475479126, "step": 242, "step_time": 91.55293344007805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3363.0, "completions/max_terminated_length": 3333.0, "completions/mean_length": 819.078125, "completions/mean_terminated_length": 778.698486328125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5793705433607101, "epoch": 0.5985221674876847, "frac_reward_zero_std": 0.25, "grad_norm": 0.06361807348388746, "kl": 0.153956466820091, "learning_rate": 4.7402563666585817e-05, "loss": 0.19218356907367706, "num_tokens": 36724926.0, "reward": 4.33203125, "reward_std": 1.7366580963134766, "rewards/reward_func/mean": 0.4813368055555556, "rewards/reward_func/std": 0.24346151699622473, "sampling/importance_sampling_ratio/max": 2.9986538887023926, "sampling/importance_sampling_ratio/mean": 0.9620853662490845, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.622957229614258, "sampling/sampling_logp_difference/mean": 0.16694733500480652, "step": 243, "step_time": 98.90079542505555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 2506.0, "completions/mean_length": 1158.15625, "completions/mean_terminated_length": 962.300048828125, "completions/min_length": 282.0, "completions/min_terminated_length": 282.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6488623917102814, "epoch": 0.6009852216748769, "frac_reward_zero_std": 0.0, "grad_norm": 0.024661429253724953, "kl": 0.019309990806505084, "learning_rate": 4.7380989898761957e-05, "loss": -0.28670600056648254, "num_tokens": 36882968.0, "reward": 4.1796875, "reward_std": 1.8900684118270874, "rewards/reward_func/mean": 0.4644097222222222, "rewards/reward_func/std": 0.27070868843131596, "sampling/importance_sampling_ratio/max": 2.983285427093506, "sampling/importance_sampling_ratio/mean": 0.9530576467514038, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.78699779510498, "sampling/sampling_logp_difference/mean": 0.17854541540145874, "step": 244, "step_time": 139.11421747365966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2858.0, "completions/mean_length": 876.515625, "completions/mean_terminated_length": 815.5, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6686416566371918, "epoch": 0.603448275862069, "frac_reward_zero_std": 0.0, "grad_norm": 0.03471602827656467, "kl": 0.02017925539985299, "learning_rate": 4.735933185856906e-05, "loss": -0.05181242153048515, "num_tokens": 37020313.0, "reward": 3.859375, "reward_std": 1.9779914617538452, "rewards/reward_func/mean": 0.4288194444444444, "rewards/reward_func/std": 0.3127517153819402, "sampling/importance_sampling_ratio/max": 2.994431972503662, "sampling/importance_sampling_ratio/mean": 0.9652891159057617, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.530597686767578, "sampling/sampling_logp_difference/mean": 0.16316168010234833, "step": 245, "step_time": 118.19300652644597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2223.0, "completions/mean_length": 1040.265625, "completions/mean_terminated_length": 939.4261474609375, "completions/min_length": 270.0, "completions/min_terminated_length": 270.0, "degenerate_groups_filtered": 0.0, "entropy": 0.645111545920372, "epoch": 0.6059113300492611, "frac_reward_zero_std": 0.0, "grad_norm": 0.036281618178556546, "kl": 0.022860198747366667, "learning_rate": 4.733758962755734e-05, "loss": 0.048598241060972214, "num_tokens": 37174378.0, "reward": 3.98046875, "reward_std": 1.978206992149353, "rewards/reward_func/mean": 0.4422743055555556, "rewards/reward_func/std": 0.29514625171820325, "sampling/importance_sampling_ratio/max": 2.9986579418182373, "sampling/importance_sampling_ratio/mean": 0.9547132849693298, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.925469398498535, "sampling/sampling_logp_difference/mean": 0.18641415238380432, "step": 246, "step_time": 123.81210570293479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3950.0, "completions/max_terminated_length": 3950.0, "completions/mean_length": 723.46875, "completions/mean_terminated_length": 723.46875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6742394268512726, "epoch": 0.6083743842364532, "frac_reward_zero_std": 0.0, "grad_norm": 0.04995747395315036, "kl": 0.026606498286128044, "learning_rate": 4.7315763287594e-05, "loss": 0.2653157711029053, "num_tokens": 37301944.0, "reward": 3.33984375, "reward_std": 2.105532646179199, "rewards/reward_func/mean": 0.37109375, "rewards/reward_func/std": 0.2963992158571879, "sampling/importance_sampling_ratio/max": 2.9936087131500244, "sampling/importance_sampling_ratio/mean": 0.9594826102256775, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.18620491027832, "sampling/sampling_logp_difference/mean": 0.182135671377182, "step": 247, "step_time": 103.0024021465797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 4061.0, "completions/mean_length": 1108.421875, "completions/mean_terminated_length": 974.11669921875, "completions/min_length": 204.0, "completions/min_terminated_length": 274.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6455037593841553, "epoch": 0.6108374384236454, "frac_reward_zero_std": 0.0, "grad_norm": 0.03148143550121695, "kl": 0.01934328768402338, "learning_rate": 4.729385292086297e-05, "loss": -0.24869614839553833, "num_tokens": 37464915.0, "reward": 3.55078125, "reward_std": 2.086036205291748, "rewards/reward_func/mean": 0.39453125, "rewards/reward_func/std": 0.2936388701200485, "sampling/importance_sampling_ratio/max": 2.9959442615509033, "sampling/importance_sampling_ratio/mean": 0.9551997780799866, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.284041404724121, "sampling/sampling_logp_difference/mean": 0.17975515127182007, "step": 248, "step_time": 145.99462776235305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2113.0, "completions/mean_length": 939.1875, "completions/mean_terminated_length": 837.3547973632812, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6875911951065063, "epoch": 0.6133004926108374, "frac_reward_zero_std": 0.0, "grad_norm": 0.03540754558977053, "kl": 0.026569989509880543, "learning_rate": 4.727185860986454e-05, "loss": 0.04270695894956589, "num_tokens": 37604927.0, "reward": 2.984375, "reward_std": 2.1193904876708984, "rewards/reward_func/mean": 0.3315972222222222, "rewards/reward_func/std": 0.2987919400135676, "sampling/importance_sampling_ratio/max": 2.998774290084839, "sampling/importance_sampling_ratio/mean": 0.957014799118042, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.659464836120605, "sampling/sampling_logp_difference/mean": 0.1871645301580429, "step": 249, "step_time": 121.0595855594147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3092.0, "completions/max_terminated_length": 3092.0, "completions/mean_length": 817.203125, "completions/mean_terminated_length": 817.203125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6271493583917618, "epoch": 0.6157635467980296, "frac_reward_zero_std": 0.0, "grad_norm": 0.040183803438341846, "kl": 0.022904008626937866, "learning_rate": 4.72497804374151e-05, "loss": 0.10319769382476807, "num_tokens": 37735276.0, "reward": 3.65625, "reward_std": 1.9535783529281616, "rewards/reward_func/mean": 0.40625, "rewards/reward_func/std": 0.27815084324942696, "sampling/importance_sampling_ratio/max": 2.9912922382354736, "sampling/importance_sampling_ratio/mean": 0.9657875299453735, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.99195671081543, "sampling/sampling_logp_difference/mean": 0.15472693741321564, "step": 250, "step_time": 92.37789764790796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3455.0, "completions/mean_length": 893.828125, "completions/mean_terminated_length": 843.0000610351562, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6401469260454178, "epoch": 0.6182266009852216, "frac_reward_zero_std": 0.0, "grad_norm": 0.03978805998415212, "kl": 0.03360833553597331, "learning_rate": 4.722761848664681e-05, "loss": 0.2213575392961502, "num_tokens": 37874577.0, "reward": 3.01171875, "reward_std": 2.0607969760894775, "rewards/reward_func/mean": 0.3346354166666667, "rewards/reward_func/std": 0.2820756352610058, "sampling/importance_sampling_ratio/max": 2.979861259460449, "sampling/importance_sampling_ratio/mean": 0.9564794301986694, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 8.897414207458496, "sampling/sampling_logp_difference/mean": 0.17922547459602356, "step": 251, "step_time": 146.73478505690582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 2845.0, "completions/max_terminated_length": 2845.0, "completions/mean_length": 720.3125, "completions/mean_terminated_length": 712.6290283203125, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6507443785667419, "epoch": 0.6206896551724138, "frac_reward_zero_std": 0.0, "grad_norm": 0.039921392666201184, "kl": 0.031218301504850388, "learning_rate": 4.720537284100728e-05, "loss": -0.04249424859881401, "num_tokens": 38001621.0, "reward": 3.25390625, "reward_std": 1.9757920503616333, "rewards/reward_func/mean": 0.3615451388888889, "rewards/reward_func/std": 0.2661890693836742, "sampling/importance_sampling_ratio/max": 2.9958386421203613, "sampling/importance_sampling_ratio/mean": 0.9622673392295837, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.243165969848633, "sampling/sampling_logp_difference/mean": 0.1693202704191208, "step": 252, "step_time": 101.49904671194963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2814.0, "completions/mean_length": 1095.40625, "completions/mean_terminated_length": 947.8359985351562, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5974663645029068, "epoch": 0.6231527093596059, "frac_reward_zero_std": 0.0, "grad_norm": 0.022696110143372545, "kl": 0.019222368020564318, "learning_rate": 4.7183043584259254e-05, "loss": -0.07331643998622894, "num_tokens": 38154367.0, "reward": 4.56640625, "reward_std": 1.6361746788024902, "rewards/reward_func/mean": 0.5073784722222222, "rewards/reward_func/std": 0.27994223973817295, "sampling/importance_sampling_ratio/max": 2.9847824573516846, "sampling/importance_sampling_ratio/mean": 0.9601361751556396, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.678767204284668, "sampling/sampling_logp_difference/mean": 0.1662071794271469, "step": 253, "step_time": 172.382303963881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 812.78125, "completions/mean_terminated_length": 812.78125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6090056896209717, "epoch": 0.625615763546798, "frac_reward_zero_std": 0.0, "grad_norm": 0.05873182180638249, "kl": 0.022160206688567996, "learning_rate": 4.716063080048031e-05, "loss": 0.07988754659891129, "num_tokens": 38288849.0, "reward": 3.99609375, "reward_std": 1.828723669052124, "rewards/reward_func/mean": 0.4440104166666667, "rewards/reward_func/std": 0.2540795885854297, "sampling/importance_sampling_ratio/max": 2.9953503608703613, "sampling/importance_sampling_ratio/mean": 0.9602220058441162, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.241506576538086, "sampling/sampling_logp_difference/mean": 0.1675276756286621, "step": 254, "step_time": 66.64111198205501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 2275.0, "completions/max_terminated_length": 2275.0, "completions/mean_length": 746.984375, "completions/mean_terminated_length": 746.984375, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5543633997440338, "epoch": 0.6280788177339901, "frac_reward_zero_std": 0.0, "grad_norm": 0.03127580001868004, "kl": 0.029117131140083075, "learning_rate": 4.713813457406253e-05, "loss": 0.04796233028173447, "num_tokens": 38418288.0, "reward": 4.15625, "reward_std": 1.6923614740371704, "rewards/reward_func/mean": 0.4618055555555556, "rewards/reward_func/std": 0.20467053850491843, "sampling/importance_sampling_ratio/max": 2.996886730194092, "sampling/importance_sampling_ratio/mean": 0.9699411392211914, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.242613792419434, "sampling/sampling_logp_difference/mean": 0.13953782618045807, "step": 255, "step_time": 103.97602935507894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2907.0, "completions/mean_length": 979.46875, "completions/mean_terminated_length": 878.9354858398438, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5546245723962784, "epoch": 0.6305418719211823, "frac_reward_zero_std": 0.0, "grad_norm": 0.029271437720684072, "kl": 0.02258498244918883, "learning_rate": 4.7115554989712185e-05, "loss": 0.043127041310071945, "num_tokens": 38572830.0, "reward": 4.13671875, "reward_std": 1.8316712379455566, "rewards/reward_func/mean": 0.4596354166666667, "rewards/reward_func/std": 0.2634606758753459, "sampling/importance_sampling_ratio/max": 2.999389171600342, "sampling/importance_sampling_ratio/mean": 0.9619561433792114, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.921653747558594, "sampling/sampling_logp_difference/mean": 0.15635137259960175, "step": 256, "step_time": 217.4995983429253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3510.0, "completions/mean_length": 893.671875, "completions/mean_terminated_length": 845.8524169921875, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "degenerate_groups_filtered": 0.0, "entropy": 0.665133610367775, "epoch": 0.6330049261083743, "frac_reward_zero_std": 0.0, "grad_norm": 0.03984192918620892, "kl": 0.046457535587251186, "learning_rate": 4.709289213244943e-05, "loss": 0.10279978066682816, "num_tokens": 38707145.0, "reward": 3.765625, "reward_std": 1.9659174680709839, "rewards/reward_func/mean": 0.4184027777777778, "rewards/reward_func/std": 0.2853093130720986, "sampling/importance_sampling_ratio/max": 2.995044708251953, "sampling/importance_sampling_ratio/mean": 0.9582663774490356, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.383708953857422, "sampling/sampling_logp_difference/mean": 0.17232593894004822, "step": 257, "step_time": 118.28496656008065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 4096.0, "completions/max_terminated_length": 1689.0, "completions/mean_length": 835.921875, "completions/mean_terminated_length": 784.1746215820312, "completions/min_length": 359.0, "completions/min_terminated_length": 359.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6838351041078568, "epoch": 0.6354679802955665, "frac_reward_zero_std": 0.25, "grad_norm": 0.03628155122006123, "kl": 0.04887444619089365, "learning_rate": 4.707014608760797e-05, "loss": -0.004470369778573513, "num_tokens": 38837156.0, "reward": 4.25390625, "reward_std": 1.6496105194091797, "rewards/reward_func/mean": 0.47265625, "rewards/reward_func/std": 0.21708844270971087, "sampling/importance_sampling_ratio/max": 2.9985907077789307, "sampling/importance_sampling_ratio/mean": 0.9622358083724976, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.900800704956055, "sampling/sampling_logp_difference/mean": 0.1803233027458191, "step": 258, "step_time": 123.32645462709479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3112.0, "completions/mean_length": 1010.75, "completions/mean_terminated_length": 957.8709106445312, "completions/min_length": 265.0, "completions/min_terminated_length": 265.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6722955405712128, "epoch": 0.6379310344827587, "frac_reward_zero_std": 0.0, "grad_norm": 0.028878722694508367, "kl": 0.02964442828670144, "learning_rate": 4.704731694083472e-05, "loss": -0.09307853877544403, "num_tokens": 38994644.0, "reward": 4.13671875, "reward_std": 1.8365392684936523, "rewards/reward_func/mean": 0.4596354166666667, "rewards/reward_func/std": 0.2944721562994851, "sampling/importance_sampling_ratio/max": 2.999586582183838, "sampling/importance_sampling_ratio/mean": 0.9555143117904663, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.124902725219727, "sampling/sampling_logp_difference/mean": 0.18143421411514282, "step": 259, "step_time": 159.91136281075887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3131.0, "completions/max_terminated_length": 3131.0, "completions/mean_length": 962.84375, "completions/mean_terminated_length": 968.6032104492188, "completions/min_length": 214.0, "completions/min_terminated_length": 214.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5780832916498184, "epoch": 0.6403940886699507, "frac_reward_zero_std": 0.0, "grad_norm": 0.13977232949753662, "kl": 0.02337419893592596, "learning_rate": 4.7024404778089535e-05, "loss": 0.2663920521736145, "num_tokens": 39152794.0, "reward": 4.54296875, "reward_std": 1.6020538806915283, "rewards/reward_func/mean": 0.5047743055555556, "rewards/reward_func/std": 0.25629327860143447, "sampling/importance_sampling_ratio/max": 2.9913201332092285, "sampling/importance_sampling_ratio/mean": 0.9610067009925842, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.624191284179688, "sampling/sampling_logp_difference/mean": 0.15946471691131592, "step": 260, "step_time": 93.53231204720214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2421.0, "completions/mean_length": 1025.125, "completions/mean_terminated_length": 974.4677124023438, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7002881020307541, "epoch": 0.6428571428571429, "frac_reward_zero_std": 0.0, "grad_norm": 0.02946249950049306, "kl": 0.02944745123386383, "learning_rate": 4.7001409685644824e-05, "loss": -0.18909524381160736, "num_tokens": 39310626.0, "reward": 4.3828125, "reward_std": 1.728592872619629, "rewards/reward_func/mean": 0.4869791666666667, "rewards/reward_func/std": 0.280566586388482, "sampling/importance_sampling_ratio/max": 2.998537540435791, "sampling/importance_sampling_ratio/mean": 0.952286958694458, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.772732734680176, "sampling/sampling_logp_difference/mean": 0.18932656943798065, "step": 261, "step_time": 139.19076150492765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3891.0, "completions/mean_length": 1219.21875, "completions/mean_terminated_length": 1049.6949462890625, "completions/min_length": 317.0, "completions/min_terminated_length": 317.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6386135667562485, "epoch": 0.645320197044335, "frac_reward_zero_std": 0.0, "grad_norm": 0.03738181468253387, "kl": 0.038119449745863676, "learning_rate": 4.697833175008528e-05, "loss": 0.21018671989440918, "num_tokens": 39465136.0, "reward": 4.01171875, "reward_std": 1.8694652318954468, "rewards/reward_func/mean": 0.4457465277777778, "rewards/reward_func/std": 0.29815065529611373, "sampling/importance_sampling_ratio/max": 2.9996824264526367, "sampling/importance_sampling_ratio/mean": 0.9597668647766113, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.762887954711914, "sampling/sampling_logp_difference/mean": 0.16622015833854675, "step": 262, "step_time": 129.9858180533629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2786.0, "completions/mean_length": 1279.234375, "completions/mean_terminated_length": 1076.4827880859375, "completions/min_length": 295.0, "completions/min_terminated_length": 295.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5774033218622208, "epoch": 0.6477832512315271, "frac_reward_zero_std": 0.0, "grad_norm": 0.02127924275094592, "kl": 0.020256227115169168, "learning_rate": 4.695517105830752e-05, "loss": -0.029037898406386375, "num_tokens": 39632367.0, "reward": 4.48046875, "reward_std": 1.5725224018096924, "rewards/reward_func/mean": 0.4978298611111111, "rewards/reward_func/std": 0.24498725765281254, "sampling/importance_sampling_ratio/max": 2.997105360031128, "sampling/importance_sampling_ratio/mean": 0.959444522857666, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.374399185180664, "sampling/sampling_logp_difference/mean": 0.16105125844478607, "step": 263, "step_time": 134.9195441652555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3584.0, "completions/mean_length": 1325.46875, "completions/mean_terminated_length": 1227.8333740234375, "completions/min_length": 421.0, "completions/min_terminated_length": 421.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6147435158491135, "epoch": 0.6502463054187192, "frac_reward_zero_std": 0.25, "grad_norm": 0.01633085128646359, "kl": 0.022814700147137046, "learning_rate": 4.6931927697519764e-05, "loss": -0.1795727014541626, "num_tokens": 39799197.0, "reward": 4.76171875, "reward_std": 1.5297510623931885, "rewards/reward_func/mean": 0.5290798611111112, "rewards/reward_func/std": 0.24876019855340323, "sampling/importance_sampling_ratio/max": 2.994166374206543, "sampling/importance_sampling_ratio/mean": 0.9519761204719543, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 21.247499465942383, "sampling/sampling_logp_difference/mean": 0.17912611365318298, "step": 264, "step_time": 176.04406498302706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2367.0, "completions/mean_length": 1012.0625, "completions/mean_terminated_length": 776.8643798828125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8938929438591003, "epoch": 0.6527093596059114, "frac_reward_zero_std": 0.0, "grad_norm": 0.02507752291342934, "kl": 0.026316776871681213, "learning_rate": 4.690860175524151e-05, "loss": -0.04639795422554016, "num_tokens": 39950705.0, "reward": 4.5, "reward_std": 1.4993385076522827, "rewards/reward_func/mean": 0.5, "rewards/reward_func/std": 0.21232767485909992, "sampling/importance_sampling_ratio/max": 2.997582197189331, "sampling/importance_sampling_ratio/mean": 0.95402991771698, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.430258750915527, "sampling/sampling_logp_difference/mean": 0.18939216434955597, "step": 265, "step_time": 193.00349966436625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2397.0, "completions/mean_length": 1005.28125, "completions/mean_terminated_length": 781.5689697265625, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6178922802209854, "epoch": 0.6551724137931034, "frac_reward_zero_std": 0.0, "grad_norm": 0.02221771680669154, "kl": 0.030900841113179922, "learning_rate": 4.688519331930321e-05, "loss": -0.07329220324754715, "num_tokens": 40096659.0, "reward": 4.40625, "reward_std": 1.6284303665161133, "rewards/reward_func/mean": 0.4895833333333333, "rewards/reward_func/std": 0.25941121329863864, "sampling/importance_sampling_ratio/max": 2.98852801322937, "sampling/importance_sampling_ratio/mean": 0.9561002254486084, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.55964469909668, "sampling/sampling_logp_difference/mean": 0.16870857775211334, "step": 266, "step_time": 172.4602910319809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 3033.0, "completions/max_terminated_length": 3033.0, "completions/mean_length": 964.5625, "completions/mean_terminated_length": 949.6826171875, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5864181816577911, "epoch": 0.6576354679802956, "frac_reward_zero_std": 0.25, "grad_norm": 0.027350824819978775, "kl": 0.021911869291216135, "learning_rate": 4.6861702477845924e-05, "loss": 0.0393938347697258, "num_tokens": 40235127.0, "reward": 4.68359375, "reward_std": 1.304176926612854, "rewards/reward_func/mean": 0.5203993055555556, "rewards/reward_func/std": 0.1879904866218567, "sampling/importance_sampling_ratio/max": 2.9985761642456055, "sampling/importance_sampling_ratio/mean": 0.9627262949943542, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 17.747222900390625, "sampling/sampling_logp_difference/mean": 0.14875806868076324, "step": 267, "step_time": 91.70279070711695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3110.0, "completions/mean_length": 951.640625, "completions/mean_terminated_length": 796.9999389648438, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6354534775018692, "epoch": 0.6600985221674877, "frac_reward_zero_std": 0.0, "grad_norm": 0.026974131553898818, "kl": 0.03585304832085967, "learning_rate": 4.683812931932103e-05, "loss": -0.014013536274433136, "num_tokens": 40384928.0, "reward": 4.3828125, "reward_std": 1.5157694816589355, "rewards/reward_func/mean": 0.4869791666666667, "rewards/reward_func/std": 0.21941063967016008, "sampling/importance_sampling_ratio/max": 2.994793176651001, "sampling/importance_sampling_ratio/mean": 0.9575902223587036, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.459250450134277, "sampling/sampling_logp_difference/mean": 0.1708674430847168, "step": 268, "step_time": 159.46288973209448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2444.0, "completions/mean_length": 1185.3125, "completions/mean_terminated_length": 934.8275756835938, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6692711859941483, "epoch": 0.6625615763546798, "frac_reward_zero_std": 0.0, "grad_norm": 0.022866791282923456, "kl": 0.02396966377273202, "learning_rate": 4.681447393248981e-05, "loss": 0.06689761579036713, "num_tokens": 40535876.0, "reward": 4.7734375, "reward_std": 1.4189189672470093, "rewards/reward_func/mean": 0.5303819444444444, "rewards/reward_func/std": 0.2291757870051596, "sampling/importance_sampling_ratio/max": 2.9907138347625732, "sampling/importance_sampling_ratio/mean": 0.9582573175430298, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.398053169250488, "sampling/sampling_logp_difference/mean": 0.1735519915819168, "step": 269, "step_time": 120.66145277698524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3855.0, "completions/mean_length": 1729.953125, "completions/mean_terminated_length": 1403.132080078125, "completions/min_length": 333.0, "completions/min_terminated_length": 333.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7276947647333145, "epoch": 0.6650246305418719, "frac_reward_zero_std": 0.0, "grad_norm": 0.014130147226078223, "kl": 0.023325514513999224, "learning_rate": 4.679073640642321e-05, "loss": -0.03754594177007675, "num_tokens": 40748321.0, "reward": 4.91796875, "reward_std": 1.3837597370147705, "rewards/reward_func/mean": 0.5464409722222222, "rewards/reward_func/std": 0.23101985620127785, "sampling/importance_sampling_ratio/max": 2.9986205101013184, "sampling/importance_sampling_ratio/mean": 0.9422482252120972, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.669647216796875, "sampling/sampling_logp_difference/mean": 0.20420311391353607, "step": 270, "step_time": 199.377461778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4010.0, "completions/mean_length": 1326.546875, "completions/mean_terminated_length": 1076.350830078125, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5661157891154289, "epoch": 0.6674876847290641, "frac_reward_zero_std": 0.0, "grad_norm": 0.025973586434166324, "kl": 0.01932303886860609, "learning_rate": 4.676691683050142e-05, "loss": 0.06925665587186813, "num_tokens": 40915540.0, "reward": 4.37109375, "reward_std": 1.922089695930481, "rewards/reward_func/mean": 0.4856770833333333, "rewards/reward_func/std": 0.2948591311772664, "sampling/importance_sampling_ratio/max": 2.9921767711639404, "sampling/importance_sampling_ratio/mean": 0.9636217355728149, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 22.38996696472168, "sampling/sampling_logp_difference/mean": 0.14760522544384003, "step": 271, "step_time": 133.28347809705883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3370.0, "completions/mean_length": 1276.25, "completions/mean_terminated_length": 1048.17236328125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "degenerate_groups_filtered": 0.0, "entropy": 0.808114305138588, "epoch": 0.6699507389162561, "frac_reward_zero_std": 0.0, "grad_norm": 0.024293892150334624, "kl": 0.03092290833592415, "learning_rate": 4.6743015294413606e-05, "loss": -0.10823698341846466, "num_tokens": 41080932.0, "reward": 4.44921875, "reward_std": 1.7689896821975708, "rewards/reward_func/mean": 0.4943576388888889, "rewards/reward_func/std": 0.2751389775011275, "sampling/importance_sampling_ratio/max": 2.9939756393432617, "sampling/importance_sampling_ratio/mean": 0.9468337893486023, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.56399917602539, "sampling/sampling_logp_difference/mean": 0.1977367103099823, "step": 272, "step_time": 132.46389366080984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3550.0, "completions/mean_length": 1582.265625, "completions/mean_terminated_length": 1304.0726318359375, "completions/min_length": 318.0, "completions/min_terminated_length": 318.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6703736484050751, "epoch": 0.6724137931034483, "frac_reward_zero_std": 0.0, "grad_norm": 0.022956701309354365, "kl": 0.02878654282540083, "learning_rate": 4.671903188815754e-05, "loss": -0.15021948516368866, "num_tokens": 41272757.0, "reward": 4.14453125, "reward_std": 1.8876187801361084, "rewards/reward_func/mean": 0.4605034722222222, "rewards/reward_func/std": 0.28286059117979473, "sampling/importance_sampling_ratio/max": 2.9985098838806152, "sampling/importance_sampling_ratio/mean": 0.9500502943992615, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.494309425354004, "sampling/sampling_logp_difference/mean": 0.1767929345369339, "step": 273, "step_time": 130.30867488658987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2626.0, "completions/mean_length": 1213.53125, "completions/mean_terminated_length": 966.8070068359375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "degenerate_groups_filtered": 0.0, "entropy": 0.627654418349266, "epoch": 0.6748768472906403, "frac_reward_zero_std": 0.0, "grad_norm": 0.018783698500290817, "kl": 0.027171143796294928, "learning_rate": 4.6694966702039236e-05, "loss": -0.1029772013425827, "num_tokens": 41432935.0, "reward": 4.57421875, "reward_std": 1.6303664445877075, "rewards/reward_func/mean": 0.5082465277777778, "rewards/reward_func/std": 0.24990240236123404, "sampling/importance_sampling_ratio/max": 2.991697311401367, "sampling/importance_sampling_ratio/mean": 0.954232931137085, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 19.955520629882812, "sampling/sampling_logp_difference/mean": 0.16991396248340607, "step": 274, "step_time": 132.13556932890788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.15625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3856.0, "completions/mean_length": 1401.796875, "completions/mean_terminated_length": 1125.388916015625, "completions/min_length": 341.0, "completions/min_terminated_length": 341.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7419492900371552, "epoch": 0.6773399014778325, "frac_reward_zero_std": 0.25, "grad_norm": 0.009325814947082865, "kl": 0.01900778664276004, "learning_rate": 4.667081982667269e-05, "loss": -0.07872651517391205, "num_tokens": 41599946.0, "reward": 4.84375, "reward_std": 1.2476167678833008, "rewards/reward_func/mean": 0.5381944444444444, "rewards/reward_func/std": 0.20155664533376694, "sampling/importance_sampling_ratio/max": 2.9985573291778564, "sampling/importance_sampling_ratio/mean": 0.957183837890625, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.758742332458496, "sampling/sampling_logp_difference/mean": 0.17596955597400665, "step": 275, "step_time": 126.71464370447211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 4096.0, "completions/max_terminated_length": 3458.0, "completions/mean_length": 995.3125, "completions/mean_terminated_length": 788.6000366210938, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 0.0, "entropy": 0.675742506980896, "epoch": 0.6798029556650246, "frac_reward_zero_std": 0.0, "grad_norm": 0.024999889994407377, "kl": 0.03196124825626612, "learning_rate": 4.6646591352979416e-05, "loss": 0.16251075267791748, "num_tokens": 41744126.0, "reward": 4.71484375, "reward_std": 1.3450874090194702, "rewards/reward_func/mean": 0.5238715277777778, "rewards/reward_func/std": 0.22701033618715075, "sampling/importance_sampling_ratio/max": 2.9886231422424316, "sampling/importance_sampling_ratio/mean": 0.9616168141365051, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.233985900878906, "sampling/sampling_logp_difference/mean": 0.1758139729499817, "step": 276, "step_time": 126.53751606796868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2902.0, "completions/mean_length": 1149.875, "completions/mean_terminated_length": 889.0178833007812, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6683462113142014, "epoch": 0.6822660098522167, "frac_reward_zero_std": 0.25, "grad_norm": 0.032914011113616616, "kl": 0.04408724885433912, "learning_rate": 4.6622281372188246e-05, "loss": 0.1322198510169983, "num_tokens": 41903958.0, "reward": 4.390625, "reward_std": 1.5898206233978271, "rewards/reward_func/mean": 0.4878472222222222, "rewards/reward_func/std": 0.2218614485528734, "sampling/importance_sampling_ratio/max": 2.99843168258667, "sampling/importance_sampling_ratio/mean": 0.9556390643119812, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.684946060180664, "sampling/sampling_logp_difference/mean": 0.17735861241817474, "step": 277, "step_time": 135.2150932047516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3271.0, "completions/mean_length": 1111.453125, "completions/mean_terminated_length": 1061.3709716796875, "completions/min_length": 346.0, "completions/min_terminated_length": 346.0, "degenerate_groups_filtered": 0.0, "entropy": 0.56987564265728, "epoch": 0.6847290640394089, "frac_reward_zero_std": 0.25, "grad_norm": 0.021262043963326146, "kl": 0.03181112464517355, "learning_rate": 4.6597889975834884e-05, "loss": 0.01715017482638359, "num_tokens": 42067075.0, "reward": 4.859375, "reward_std": 1.3553794622421265, "rewards/reward_func/mean": 0.5399305555555556, "rewards/reward_func/std": 0.22037654287285274, "sampling/importance_sampling_ratio/max": 2.99056077003479, "sampling/importance_sampling_ratio/mean": 0.9574146270751953, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.489864349365234, "sampling/sampling_logp_difference/mean": 0.1564047932624817, "step": 278, "step_time": 141.53353557991795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3224.0, "completions/mean_length": 1250.09375, "completions/mean_terminated_length": 1062.586181640625, "completions/min_length": 329.0, "completions/min_terminated_length": 329.0, "degenerate_groups_filtered": 0.0, "entropy": 0.8913148045539856, "epoch": 0.687192118226601, "frac_reward_zero_std": 0.25, "grad_norm": 0.019608777025347725, "kl": 0.02684358460828662, "learning_rate": 4.657341725576159e-05, "loss": -0.09490776062011719, "num_tokens": 42239385.0, "reward": 4.86328125, "reward_std": 1.4522353410720825, "rewards/reward_func/mean": 0.5403645833333334, "rewards/reward_func/std": 0.25109441661172444, "sampling/importance_sampling_ratio/max": 2.9914705753326416, "sampling/importance_sampling_ratio/mean": 0.9516316652297974, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.771811485290527, "sampling/sampling_logp_difference/mean": 0.18974481523036957, "step": 279, "step_time": 131.01170259085484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2789.0, "completions/mean_length": 1550.015625, "completions/mean_terminated_length": 1175.8431396484375, "completions/min_length": 311.0, "completions/min_terminated_length": 311.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6808439493179321, "epoch": 0.6896551724137931, "frac_reward_zero_std": 0.0, "grad_norm": 0.024437911228958144, "kl": 0.025552792474627495, "learning_rate": 4.654886330411682e-05, "loss": -0.004699625074863434, "num_tokens": 42423594.0, "reward": 4.0078125, "reward_std": 2.062980890274048, "rewards/reward_func/mean": 0.4453125, "rewards/reward_func/std": 0.3020985300342242, "sampling/importance_sampling_ratio/max": 2.998654365539551, "sampling/importance_sampling_ratio/mean": 0.9476783275604248, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 16.08974266052246, "sampling/sampling_logp_difference/mean": 0.18657398223876953, "step": 280, "step_time": 137.77152940188535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3953.0, "completions/mean_length": 1437.078125, "completions/mean_terminated_length": 1263.9830322265625, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7709734290838242, "epoch": 0.6921182266009852, "frac_reward_zero_std": 0.0, "grad_norm": 0.02035823015500469, "kl": 0.023151292465627193, "learning_rate": 4.6524228213354935e-05, "loss": -0.13394278287887573, "num_tokens": 42607135.0, "reward": 4.65234375, "reward_std": 1.4790985584259033, "rewards/reward_func/mean": 0.5169270833333334, "rewards/reward_func/std": 0.2436904509862264, "sampling/importance_sampling_ratio/max": 2.998265266418457, "sampling/importance_sampling_ratio/mean": 0.9457845687866211, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.194257736206055, "sampling/sampling_logp_difference/mean": 0.20592188835144043, "step": 281, "step_time": 127.69270811229944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 4053.0, "completions/mean_length": 1238.421875, "completions/mean_terminated_length": 1022.4035034179688, "completions/min_length": 368.0, "completions/min_terminated_length": 368.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6019101142883301, "epoch": 0.6945812807881774, "frac_reward_zero_std": 0.0, "grad_norm": 0.025745355691416272, "kl": 0.034187129233032465, "learning_rate": 4.649951207623579e-05, "loss": 0.05697305127978325, "num_tokens": 42769034.0, "reward": 4.66796875, "reward_std": 1.4967256784439087, "rewards/reward_func/mean": 0.5186631944444444, "rewards/reward_func/std": 0.23547837634881338, "sampling/importance_sampling_ratio/max": 2.9987599849700928, "sampling/importance_sampling_ratio/mean": 0.9599939584732056, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.365039825439453, "sampling/sampling_logp_difference/mean": 0.1549825668334961, "step": 282, "step_time": 139.72987941768952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3680.0, "completions/mean_length": 1312.171875, "completions/mean_terminated_length": 1049.9285888671875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7229396849870682, "epoch": 0.6970443349753694, "frac_reward_zero_std": 0.25, "grad_norm": 0.02292364974505966, "kl": 0.02232949109748006, "learning_rate": 4.647471498582441e-05, "loss": 0.04625914245843887, "num_tokens": 42932885.0, "reward": 4.8671875, "reward_std": 1.4029226303100586, "rewards/reward_func/mean": 0.5407986111111112, "rewards/reward_func/std": 0.23832206345266765, "sampling/importance_sampling_ratio/max": 2.9951465129852295, "sampling/importance_sampling_ratio/mean": 0.9536824226379395, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.12485408782959, "sampling/sampling_logp_difference/mean": 0.18273432552814484, "step": 283, "step_time": 134.87993472395465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2364.0, "completions/mean_length": 1056.125, "completions/mean_terminated_length": 828.7118530273438, "completions/min_length": 302.0, "completions/min_terminated_length": 302.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7263010889291763, "epoch": 0.6995073891625616, "frac_reward_zero_std": 0.25, "grad_norm": 0.020764056886669293, "kl": 0.03459874168038368, "learning_rate": 4.644983703549063e-05, "loss": 0.014433782547712326, "num_tokens": 43095389.0, "reward": 4.80859375, "reward_std": 1.465030312538147, "rewards/reward_func/mean": 0.5342881944444444, "rewards/reward_func/std": 0.2360355622238583, "sampling/importance_sampling_ratio/max": 2.9997096061706543, "sampling/importance_sampling_ratio/mean": 0.9541752934455872, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.002429008483887, "sampling/sampling_logp_difference/mean": 0.18615329265594482, "step": 284, "step_time": 130.1994199147448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3178.0, "completions/mean_length": 1032.71875, "completions/mean_terminated_length": 770.1929931640625, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "degenerate_groups_filtered": 0.0, "entropy": 0.5880027413368225, "epoch": 0.7019704433497537, "frac_reward_zero_std": 0.25, "grad_norm": 0.02041600877303681, "kl": 0.028392331209033728, "learning_rate": 4.642487831890878e-05, "loss": -0.03497108072042465, "num_tokens": 43233963.0, "reward": 4.52734375, "reward_std": 1.5520833730697632, "rewards/reward_func/mean": 0.5030381944444444, "rewards/reward_func/std": 0.239333333240615, "sampling/importance_sampling_ratio/max": 2.9885475635528564, "sampling/importance_sampling_ratio/mean": 0.9662536382675171, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.289167404174805, "sampling/sampling_logp_difference/mean": 0.1472897231578827, "step": 285, "step_time": 155.66030428768136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.109375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3764.0, "completions/mean_length": 1746.390625, "completions/mean_terminated_length": 1538.9649658203125, "completions/min_length": 325.0, "completions/min_terminated_length": 325.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7788778990507126, "epoch": 0.7044334975369458, "frac_reward_zero_std": 0.0, "grad_norm": 0.024984738066231114, "kl": 0.02480602590367198, "learning_rate": 4.639983893005728e-05, "loss": 0.00569998100399971, "num_tokens": 43435364.0, "reward": 4.0234375, "reward_std": 1.9383958578109741, "rewards/reward_func/mean": 0.4470486111111111, "rewards/reward_func/std": 0.2939976685576969, "sampling/importance_sampling_ratio/max": 2.9908130168914795, "sampling/importance_sampling_ratio/mean": 0.9435839056968689, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.895567893981934, "sampling/sampling_logp_difference/mean": 0.2047470062971115, "step": 286, "step_time": 201.04906010790728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2403.0, "completions/mean_length": 1204.6875, "completions/mean_terminated_length": 1040.389892578125, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7257010787725449, "epoch": 0.7068965517241379, "frac_reward_zero_std": 0.0, "grad_norm": 0.016581405906852593, "kl": 0.02703442983329296, "learning_rate": 4.6374718963218306e-05, "loss": -0.1384935826063156, "num_tokens": 43601968.0, "reward": 4.8125, "reward_std": 1.2198750972747803, "rewards/reward_func/mean": 0.5347222222222222, "rewards/reward_func/std": 0.20516829854912227, "sampling/importance_sampling_ratio/max": 2.9976420402526855, "sampling/importance_sampling_ratio/mean": 0.9500475525856018, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.646903038024902, "sampling/sampling_logp_difference/mean": 0.18634405732154846, "step": 287, "step_time": 125.75703874812461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3669.0, "completions/mean_length": 1340.328125, "completions/mean_terminated_length": 1081.2679443359375, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6680119037628174, "epoch": 0.7093596059113301, "frac_reward_zero_std": 0.0, "grad_norm": 0.07098619776798472, "kl": 0.026068232487887144, "learning_rate": 4.6349518512977454e-05, "loss": -0.17846474051475525, "num_tokens": 43777477.0, "reward": 4.42578125, "reward_std": 1.4811512231826782, "rewards/reward_func/mean": 0.4917534722222222, "rewards/reward_func/std": 0.2245293590757582, "sampling/importance_sampling_ratio/max": 2.9975290298461914, "sampling/importance_sampling_ratio/mean": 0.9534498453140259, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.924224853515625, "sampling/sampling_logp_difference/mean": 0.17526039481163025, "step": 288, "step_time": 133.57338830200024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 2947.0, "completions/mean_length": 1322.5, "completions/mean_terminated_length": 1108.034423828125, "completions/min_length": 301.0, "completions/min_terminated_length": 301.0, "degenerate_groups_filtered": 0.0, "entropy": 0.70571568608284, "epoch": 0.7118226600985221, "frac_reward_zero_std": 0.0, "grad_norm": 0.023923208279176593, "kl": 0.02546512195840478, "learning_rate": 4.632423767422335e-05, "loss": -0.15840458869934082, "num_tokens": 43956341.0, "reward": 4.46484375, "reward_std": 1.6849398612976074, "rewards/reward_func/mean": 0.49609375, "rewards/reward_func/std": 0.2616619947883818, "sampling/importance_sampling_ratio/max": 2.999711036682129, "sampling/importance_sampling_ratio/mean": 0.9472851753234863, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.349294662475586, "sampling/sampling_logp_difference/mean": 0.19516421854496002, "step": 289, "step_time": 137.85350774601102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3769.0, "completions/mean_length": 939.484375, "completions/mean_terminated_length": 837.6612548828125, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6920178681612015, "epoch": 0.7142857142857143, "frac_reward_zero_std": 0.0, "grad_norm": 0.026666690284316723, "kl": 0.0283396546728909, "learning_rate": 4.629887654214735e-05, "loss": -0.08653214573860168, "num_tokens": 44113716.0, "reward": 4.62109375, "reward_std": 1.396560549736023, "rewards/reward_func/mean": 0.5134548611111112, "rewards/reward_func/std": 0.21504902177386814, "sampling/importance_sampling_ratio/max": 2.999668598175049, "sampling/importance_sampling_ratio/mean": 0.9582748413085938, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 11.547659873962402, "sampling/sampling_logp_difference/mean": 0.17689433693885803, "step": 290, "step_time": 129.55061276513152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 4026.0, "completions/mean_length": 1164.671875, "completions/mean_terminated_length": 1070.1129150390625, "completions/min_length": 52.0, "completions/min_terminated_length": 52.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6513230800628662, "epoch": 0.7167487684729064, "frac_reward_zero_std": 0.0, "grad_norm": 0.02294401858939595, "kl": 0.02601558994501829, "learning_rate": 4.627343521224308e-05, "loss": 0.025205962359905243, "num_tokens": 44268895.0, "reward": 4.85546875, "reward_std": 1.494570016860962, "rewards/reward_func/mean": 0.5394965277777778, "rewards/reward_func/std": 0.23487240738338894, "sampling/importance_sampling_ratio/max": 2.9996352195739746, "sampling/importance_sampling_ratio/mean": 0.9548361301422119, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 15.297969818115234, "sampling/sampling_logp_difference/mean": 0.1707584261894226, "step": 291, "step_time": 130.5793014159426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 1857.0, "completions/max_terminated_length": 1857.0, "completions/mean_length": 640.78125, "completions/mean_terminated_length": 642.920654296875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6324314475059509, "epoch": 0.7192118226600985, "frac_reward_zero_std": 0.25, "grad_norm": 0.030702276104398882, "kl": 0.034378912299871445, "learning_rate": 4.62479137803062e-05, "loss": 0.026865554973483086, "num_tokens": 44394545.0, "reward": 4.66796875, "reward_std": 1.4692987203598022, "rewards/reward_func/mean": 0.5186631944444444, "rewards/reward_func/std": 0.22203164630466038, "sampling/importance_sampling_ratio/max": 2.9980506896972656, "sampling/importance_sampling_ratio/mean": 0.9679237604141235, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.514067649841309, "sampling/sampling_logp_difference/mean": 0.14799155294895172, "step": 292, "step_time": 64.88541042688303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3588.0, "completions/mean_length": 1286.625, "completions/mean_terminated_length": 1052.0172119140625, "completions/min_length": 342.0, "completions/min_terminated_length": 342.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7215117067098618, "epoch": 0.7216748768472906, "frac_reward_zero_std": 0.0, "grad_norm": 0.06081784231964765, "kl": 0.0315888044424355, "learning_rate": 4.6222312342433946e-05, "loss": -0.08961963653564453, "num_tokens": 44563561.0, "reward": 4.25390625, "reward_std": 1.7454545497894287, "rewards/reward_func/mean": 0.47265625, "rewards/reward_func/std": 0.2583325778444608, "sampling/importance_sampling_ratio/max": 2.9989945888519287, "sampling/importance_sampling_ratio/mean": 0.9483383297920227, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.186487197875977, "sampling/sampling_logp_difference/mean": 0.19600994884967804, "step": 293, "step_time": 164.69752652896568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2889.0, "completions/mean_length": 989.859375, "completions/mean_terminated_length": 889.6612548828125, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7307186424732208, "epoch": 0.7241379310344828, "frac_reward_zero_std": 0.25, "grad_norm": 0.024207226420314105, "kl": 0.04108696198090911, "learning_rate": 4.6196630995024836e-05, "loss": -0.027970120310783386, "num_tokens": 44710624.0, "reward": 4.96484375, "reward_std": 1.0836695432662964, "rewards/reward_func/mean": 0.5516493055555556, "rewards/reward_func/std": 0.19183417658011118, "sampling/importance_sampling_ratio/max": 2.995861530303955, "sampling/importance_sampling_ratio/mean": 0.9493570327758789, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.687049865722656, "sampling/sampling_logp_difference/mean": 0.19344517588615417, "step": 294, "step_time": 143.77042742003687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 3193.0, "completions/mean_length": 1220.4375, "completions/mean_terminated_length": 1079.016357421875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "degenerate_groups_filtered": 0.0, "entropy": 0.7118326276540756, "epoch": 0.7266009852216748, "frac_reward_zero_std": 0.0, "grad_norm": 0.03225832050555783, "kl": 0.020965780597180128, "learning_rate": 4.617086983477823e-05, "loss": 0.16336123645305634, "num_tokens": 44870332.0, "reward": 4.57421875, "reward_std": 1.6497231721878052, "rewards/reward_func/mean": 0.5082465277777778, "rewards/reward_func/std": 0.25509046349260545, "sampling/importance_sampling_ratio/max": 2.997741460800171, "sampling/importance_sampling_ratio/mean": 0.955689013004303, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 14.111473083496094, "sampling/sampling_logp_difference/mean": 0.1800614446401596, "step": 295, "step_time": 126.94880920927972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.046875, "completions/max_length": 4096.0, "completions/max_terminated_length": 2924.0, "completions/mean_length": 1025.46875, "completions/mean_terminated_length": 874.458984375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "degenerate_groups_filtered": 1.0, "entropy": 0.7754890322685242, "epoch": 0.729064039408867, "frac_reward_zero_std": 0.25, "grad_norm": 0.02781327199812679, "kl": 0.025280939415097237, "learning_rate": 4.614502895869405e-05, "loss": 0.048563357442617416, "num_tokens": 45021690.0, "reward": 4.76171875, "reward_std": 1.0795994997024536, "rewards/reward_func/mean": 0.5290798611111112, "rewards/reward_func/std": 0.1537442902723948, "sampling/importance_sampling_ratio/max": 2.9947688579559326, "sampling/importance_sampling_ratio/mean": 0.9558762311935425, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.272590637207031, "sampling/sampling_logp_difference/mean": 0.19144631922245026, "step": 296, "step_time": 121.46926600020379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.09375, "completions/max_length": 4096.0, "completions/max_terminated_length": 3127.0, "completions/mean_length": 1396.6875, "completions/mean_terminated_length": 1191.2930908203125, "completions/min_length": 353.0, "completions/min_terminated_length": 353.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6622474938631058, "epoch": 0.7315270935960592, "frac_reward_zero_std": 0.0, "grad_norm": 0.0225800041391618, "kl": 0.03178559988737106, "learning_rate": 4.611910846407237e-05, "loss": -0.0789480060338974, "num_tokens": 45210694.0, "reward": 4.3203125, "reward_std": 1.7953139543533325, "rewards/reward_func/mean": 0.4800347222222222, "rewards/reward_func/std": 0.26654813935359317, "sampling/importance_sampling_ratio/max": 2.9977633953094482, "sampling/importance_sampling_ratio/mean": 0.9434003233909607, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 10.753666877746582, "sampling/sampling_logp_difference/mean": 0.19646012783050537, "step": 297, "step_time": 142.21653381781653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 3221.0, "completions/mean_length": 1305.640625, "completions/mean_terminated_length": 1179.59326171875, "completions/min_length": 345.0, "completions/min_terminated_length": 345.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6847527176141739, "epoch": 0.7339901477832512, "frac_reward_zero_std": 0.25, "grad_norm": 0.013272849126583471, "kl": 0.02359460387378931, "learning_rate": 4.6093108448513035e-05, "loss": -0.038221895694732666, "num_tokens": 45383807.0, "reward": 4.9921875, "reward_std": 0.9225662350654602, "rewards/reward_func/mean": 0.5546875, "rewards/reward_func/std": 0.17556441244151857, "sampling/importance_sampling_ratio/max": 2.994771718978882, "sampling/importance_sampling_ratio/mean": 0.9457255601882935, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 12.194232940673828, "sampling/sampling_logp_difference/mean": 0.19551457464694977, "step": 298, "step_time": 129.38853779318742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.078125, "completions/max_length": 4096.0, "completions/max_terminated_length": 2380.0, "completions/mean_length": 946.296875, "completions/mean_terminated_length": 727.796630859375, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6168168187141418, "epoch": 0.7364532019704434, "frac_reward_zero_std": 0.0, "grad_norm": 0.024650196102857035, "kl": 0.03336963150650263, "learning_rate": 4.6067029009915345e-05, "loss": -0.03867091238498688, "num_tokens": 45527538.0, "reward": 4.65234375, "reward_std": 1.3646742105484009, "rewards/reward_func/mean": 0.5169270833333334, "rewards/reward_func/std": 0.21994754672050476, "sampling/importance_sampling_ratio/max": 2.9881489276885986, "sampling/importance_sampling_ratio/mean": 0.9603166580200195, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 13.246369361877441, "sampling/sampling_logp_difference/mean": 0.16486753523349762, "step": 299, "step_time": 125.68861723900773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.015625, "completions/max_length": 2524.0, "completions/max_terminated_length": 2524.0, "completions/mean_length": 769.390625, "completions/mean_terminated_length": 759.761962890625, "completions/min_length": 248.0, "completions/min_terminated_length": 248.0, "degenerate_groups_filtered": 0.0, "entropy": 0.6726339608430862, "epoch": 0.7389162561576355, "frac_reward_zero_std": 0.0, "grad_norm": 0.13877986473783605, "kl": 0.03055637050420046, "learning_rate": 4.6040870246477636e-05, "loss": 0.08779981732368469, "num_tokens": 45658923.0, "reward": 4.68359375, "reward_std": 1.3896172046661377, "rewards/reward_func/mean": 0.5203993055555556, "rewards/reward_func/std": 0.22621763911512163, "sampling/importance_sampling_ratio/max": 2.9945335388183594, "sampling/importance_sampling_ratio/mean": 0.9598626494407654, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 9.124909400939941, "sampling/sampling_logp_difference/mean": 0.16426949203014374, "step": 300, "step_time": 80.3105343640782 } ], "logging_steps": 1, "max_steps": 1624, "num_input_tokens_seen": 45658923, "num_train_epochs": 4, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 4, "trial_name": null, "trial_params": null }