{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.01125, "eval_steps": 500, "global_step": 1125, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 533.0, "completions/max_terminated_length": 533.0, "completions/mean_length": 40.0625, "completions/mean_terminated_length": 41.66666793823242, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.461312234401703, "epoch": 1e-05, "frac_reward_zero_std": 0.25, "grad_norm": 0.8767255544662476, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0055, "num_tokens": 32445.0, "reward": 2.3419277667999268, "reward_std": 2.033935546875, "rewards/rollout_reward_func/mean": 2.3419277667999268, "rewards/rollout_reward_func/std": 2.88342547416687, "sampling/importance_sampling_ratio/max": 0.568917989730835, "sampling/importance_sampling_ratio/mean": 0.4008851647377014, "sampling/importance_sampling_ratio/min": 0.021672284230589867, "sampling/sampling_logp_difference/max": 2.025857925415039, "sampling/sampling_logp_difference/mean": 0.2979532480239868, "step": 1, "step_time": 8.029544077000537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.461312234401703, "epoch": 2e-05, "grad_norm": 0.8794809579849243, "kl": 0.0, "learning_rate": 2.8571428571428575e-07, "loss": -0.0055, "step": 2, "step_time": 4.196975900000325 }, { "clip_ratio/high_max": 0.07386363670229912, "clip_ratio/high_mean": 0.03693181835114956, "clip_ratio/low_mean": 0.019570707343518734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.056502525229007006, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 267.75, "completions/mean_terminated_length": 267.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.0148856043815613, "epoch": 3e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.37352433800697327, "kl": 0.03585608473258617, "learning_rate": 5.714285714285715e-07, "loss": 0.0043, "num_tokens": 79195.0, "reward": -0.19015735387802124, "reward_std": 2.6679632663726807, "rewards/rollout_reward_func/mean": -0.19015735387802124, "rewards/rollout_reward_func/std": 3.2658557891845703, "sampling/importance_sampling_ratio/max": 0.5172383189201355, "sampling/importance_sampling_ratio/mean": 0.2352834939956665, "sampling/importance_sampling_ratio/min": 0.007029580418020487, "sampling/sampling_logp_difference/max": 2.1682968139648438, "sampling/sampling_logp_difference/mean": 0.4327928423881531, "step": 3, "step_time": 9.101961994000703 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 3.0205088555812836, "epoch": 4e-05, "grad_norm": 0.4964880049228668, "kl": 0.019733898581762332, "learning_rate": 8.571428571428572e-07, "loss": 0.0042, "step": 4, "step_time": 5.269142409000779 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "completions/clipped_ratio": 0.03125, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 281.71875, "completions/mean_terminated_length": 272.8709716796875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7891452312469482, "epoch": 5e-05, "frac_reward_zero_std": 0.0, "grad_norm": 0.5345627665519714, "kl": 0.010214891051873565, "learning_rate": 1.142857142857143e-06, "loss": -0.0117, "num_tokens": 125982.0, "reward": 0.871216356754303, "reward_std": 2.4055609703063965, "rewards/rollout_reward_func/mean": 0.871216356754303, "rewards/rollout_reward_func/std": 3.3557357788085938, "sampling/importance_sampling_ratio/max": 0.5932077169418335, "sampling/importance_sampling_ratio/mean": 0.26146990060806274, "sampling/importance_sampling_ratio/min": 0.008206531405448914, "sampling/sampling_logp_difference/max": 1.6717329025268555, "sampling/sampling_logp_difference/mean": 0.34865662455558777, "step": 5, "step_time": 8.276885944000242 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.027777778450399637, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.039709596429020166, "entropy": 2.776824176311493, "epoch": 6e-05, "grad_norm": 0.4867320954799652, "kl": 0.00551642047776113, "learning_rate": 1.4285714285714286e-06, "loss": -0.0119, "step": 6, "step_time": 4.503357092999977 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0033783784601837397, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009060196811333299, "completions/clipped_ratio": 0.03125, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 176.65625, "completions/mean_terminated_length": 181.32257080078125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5317403972148895, "epoch": 7e-05, "frac_reward_zero_std": 0.5, "grad_norm": 0.1703971028327942, "kl": 0.0028867512110082316, "learning_rate": 1.7142857142857145e-06, "loss": -0.0029, "num_tokens": 166056.0, "reward": 2.294389009475708, "reward_std": 1.4667046070098877, "rewards/rollout_reward_func/mean": 2.294389009475708, "rewards/rollout_reward_func/std": 2.705355644226074, "sampling/importance_sampling_ratio/max": 0.6386930346488953, "sampling/importance_sampling_ratio/mean": 0.3456364870071411, "sampling/importance_sampling_ratio/min": 2.5246762786400248e-23, "sampling/sampling_logp_difference/max": 17.197574615478516, "sampling/sampling_logp_difference/mean": 0.5085201263427734, "step": 7, "step_time": 8.68162968300203 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.00950168923009187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015183507581241429, "entropy": 2.5394680500030518, "epoch": 8e-05, "grad_norm": 0.1963687539100647, "kl": 0.004567280291666975, "learning_rate": 2.0000000000000003e-06, "loss": -0.0025, "step": 8, "step_time": 5.354600480001864 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.020438762847334146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03293876303359866, "completions/clipped_ratio": 0.03125, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 255.875, "completions/mean_terminated_length": 253.9677276611328, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.85464283823967, "epoch": 9e-05, "frac_reward_zero_std": 0.25, "grad_norm": 0.2474498748779297, "kl": 0.011498607171233743, "learning_rate": 2.285714285714286e-06, "loss": 0.0012, "num_tokens": 210796.0, "reward": 1.3338699340820312, "reward_std": 2.5887303352355957, "rewards/rollout_reward_func/mean": 1.3338699340820312, "rewards/rollout_reward_func/std": 3.360410213470459, "sampling/importance_sampling_ratio/max": 0.5211334228515625, "sampling/importance_sampling_ratio/mean": 0.22442704439163208, "sampling/importance_sampling_ratio/min": 3.1609963938632362e-21, "sampling/sampling_logp_difference/max": 17.6353759765625, "sampling/sampling_logp_difference/mean": 0.642817497253418, "step": 9, "step_time": 8.233338939999157 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.020438762847334146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03450126247480512, "entropy": 2.84877809882164, "epoch": 0.0001, "grad_norm": 0.26172828674316406, "kl": 0.015323846295359544, "learning_rate": 2.571428571428571e-06, "loss": 0.0012, "step": 10, "step_time": 4.531096401000468 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0329861119389534, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 141.90625, "completions/mean_terminated_length": 141.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.9380592703819275, "epoch": 0.00011, "frac_reward_zero_std": 0.5, "grad_norm": 1.6061288118362427, "kl": 0.014118633189355023, "learning_rate": 2.8571428571428573e-06, "loss": 0.0003, "num_tokens": 248797.0, "reward": 1.936483383178711, "reward_std": 1.4005991220474243, "rewards/rollout_reward_func/mean": 1.936483383178711, "rewards/rollout_reward_func/std": 2.826160430908203, "sampling/importance_sampling_ratio/max": 0.6114739775657654, "sampling/importance_sampling_ratio/mean": 0.34917908906936646, "sampling/importance_sampling_ratio/min": 2.6750652803997355e-09, "sampling/sampling_logp_difference/max": 7.856645584106445, "sampling/sampling_logp_difference/mean": 0.5155301690101624, "step": 11, "step_time": 7.723483315001431 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.02864583395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03432765230536461, "entropy": 2.9353168308734894, "epoch": 0.00012, "grad_norm": 4.647777557373047, "kl": 0.288594129786361, "learning_rate": 3.142857142857143e-06, "loss": 0.0006, "step": 12, "step_time": 5.336007644998972 }, { "clip_ratio/high_max": 0.08428030461072922, "clip_ratio/high_mean": 0.04214015230536461, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04782197065651417, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 240.5625, "completions/mean_terminated_length": 240.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.3039467334747314, "epoch": 0.00013, "frac_reward_zero_std": 0.0, "grad_norm": 1.4607264995574951, "kl": 0.16910503606777638, "learning_rate": 3.428571428571429e-06, "loss": 0.0001, "num_tokens": 293842.0, "reward": -1.709991455078125, "reward_std": 1.5302071571350098, "rewards/rollout_reward_func/mean": -1.709991455078125, "rewards/rollout_reward_func/std": 2.2777531147003174, "sampling/importance_sampling_ratio/max": 0.6255157589912415, "sampling/importance_sampling_ratio/mean": 0.19142957031726837, "sampling/importance_sampling_ratio/min": 0.00013171692262403667, "sampling/sampling_logp_difference/max": 4.141427040100098, "sampling/sampling_logp_difference/mean": 0.5723981857299805, "step": 13, "step_time": 8.410398983999585 }, { "clip_ratio/high_max": 0.05650252569466829, "clip_ratio/high_mean": 0.032157512847334146, "clip_ratio/low_mean": 0.03267045598477125, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.06482796976342797, "entropy": 3.3062852919101715, "epoch": 0.00014, "grad_norm": 8.715814590454102, "kl": 2.5666821942431852, "learning_rate": 3.7142857142857146e-06, "loss": 0.0045, "step": 14, "step_time": 4.670528444000411 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.043008208042010665, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.058633208042010665, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 245.4375, "completions/mean_terminated_length": 245.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.3065137565135956, "epoch": 0.00015, "frac_reward_zero_std": 0.0, "grad_norm": 0.2631007730960846, "kl": 0.025040931563125923, "learning_rate": 4.000000000000001e-06, "loss": -0.0103, "num_tokens": 339553.0, "reward": 0.159213587641716, "reward_std": 3.5501325130462646, "rewards/rollout_reward_func/mean": 0.159213587641716, "rewards/rollout_reward_func/std": 3.448413848876953, "sampling/importance_sampling_ratio/max": 0.5595780611038208, "sampling/importance_sampling_ratio/mean": 0.20568370819091797, "sampling/importance_sampling_ratio/min": 6.484561566191845e-20, "sampling/sampling_logp_difference/max": 16.15085792541504, "sampling/sampling_logp_difference/mean": 0.6963640451431274, "step": 15, "step_time": 8.541402874000596 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.024952652165666223, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04057765216566622, "entropy": 3.2842283844947815, "epoch": 0.00016, "grad_norm": 0.38864368200302124, "kl": 0.020869135376415215, "learning_rate": 4.2857142857142855e-06, "loss": -0.0102, "step": 16, "step_time": 5.71412646600038 }, { "clip_ratio/high_max": 0.04201388917863369, "clip_ratio/high_mean": 0.021006944589316845, "clip_ratio/low_mean": 0.016666667070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03767361165955663, "completions/clipped_ratio": 0.0, "completions/max_length": 508.0, "completions/max_terminated_length": 508.0, "completions/mean_length": 91.1875, "completions/mean_terminated_length": 91.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6407302916049957, "epoch": 0.00017, "frac_reward_zero_std": 0.25, "grad_norm": 0.21912191808223724, "kl": 0.08768526151106926, "learning_rate": 4.571428571428572e-06, "loss": -0.0096, "num_tokens": 376793.0, "reward": 1.5833828449249268, "reward_std": 2.425283908843994, "rewards/rollout_reward_func/mean": 1.5833828449249268, "rewards/rollout_reward_func/std": 3.3120193481445312, "sampling/importance_sampling_ratio/max": 0.5417768359184265, "sampling/importance_sampling_ratio/mean": 0.355357825756073, "sampling/importance_sampling_ratio/min": 1.233120490780161e-17, "sampling/sampling_logp_difference/max": 14.59448528289795, "sampling/sampling_logp_difference/mean": 0.6029778122901917, "step": 17, "step_time": 7.721674226999312 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020312500186264515, "entropy": 2.6432089805603027, "epoch": 0.00018, "grad_norm": 0.5367084741592407, "kl": 0.0504135433184274, "learning_rate": 4.857142857142858e-06, "loss": -0.0093, "step": 18, "step_time": 4.235890344997642 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 229.53125, "completions/mean_terminated_length": 229.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.859953433275223, "epoch": 0.00019, "frac_reward_zero_std": 0.25, "grad_norm": 1.1420725584030151, "kl": 0.011641762724138971, "learning_rate": 5.142857142857142e-06, "loss": 0.0022, "num_tokens": 420136.0, "reward": 1.3358067274093628, "reward_std": 1.6160221099853516, "rewards/rollout_reward_func/mean": 1.3358067274093628, "rewards/rollout_reward_func/std": 3.4285242557525635, "sampling/importance_sampling_ratio/max": 0.5588779449462891, "sampling/importance_sampling_ratio/mean": 0.28632479906082153, "sampling/importance_sampling_ratio/min": 0.0588185079395771, "sampling/sampling_logp_difference/max": 1.9835186004638672, "sampling/sampling_logp_difference/mean": 0.3730927109718323, "step": 19, "step_time": 8.547131662000538 }, { "clip_ratio/high_max": 0.0625, "clip_ratio/high_mean": 0.0451388880610466, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.055555555038154125, "entropy": 2.8539642095565796, "epoch": 0.0002, "grad_norm": 0.7655095458030701, "kl": 0.013166731794626685, "learning_rate": 5.428571428571429e-06, "loss": 0.0015, "step": 20, "step_time": 4.63187864900101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017881944542750716, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017881944542750716, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 247.5, "completions/mean_terminated_length": 247.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.632557690143585, "epoch": 0.00021, "frac_reward_zero_std": 0.0, "grad_norm": 0.5802043676376343, "kl": 0.007393030231469311, "learning_rate": 5.7142857142857145e-06, "loss": -0.009, "num_tokens": 463249.0, "reward": 1.260909914970398, "reward_std": 2.367619037628174, "rewards/rollout_reward_func/mean": 1.260909914970398, "rewards/rollout_reward_func/std": 2.983116865158081, "sampling/importance_sampling_ratio/max": 1.01347017288208, "sampling/importance_sampling_ratio/mean": 0.3541862666606903, "sampling/importance_sampling_ratio/min": 0.004073122050613165, "sampling/sampling_logp_difference/max": 2.2591404914855957, "sampling/sampling_logp_difference/mean": 0.3468233644962311, "step": 21, "step_time": 9.43255045800106 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.008806818397715688, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014488636748865247, "entropy": 2.6239876449108124, "epoch": 0.00022, "grad_norm": 0.45592379570007324, "kl": 0.00711311531267711, "learning_rate": 6e-06, "loss": -0.0092, "step": 22, "step_time": 4.566415036999388 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.02864583395421505, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.03432765230536461, "completions/clipped_ratio": 0.0, "completions/max_length": 566.0, "completions/max_terminated_length": 566.0, "completions/mean_length": 148.125, "completions/mean_terminated_length": 148.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.236017107963562, "epoch": 0.00023, "frac_reward_zero_std": 0.25, "grad_norm": 0.4092108905315399, "kl": 0.03667866565956501, "learning_rate": 6.285714285714286e-06, "loss": -0.0035, "num_tokens": 503126.0, "reward": 0.3271593451499939, "reward_std": 2.437808036804199, "rewards/rollout_reward_func/mean": 0.3271593451499939, "rewards/rollout_reward_func/std": 3.4470105171203613, "sampling/importance_sampling_ratio/max": 0.714533269405365, "sampling/importance_sampling_ratio/mean": 0.27389177680015564, "sampling/importance_sampling_ratio/min": 0.03493156284093857, "sampling/sampling_logp_difference/max": 2.0482232570648193, "sampling/sampling_logp_difference/mean": 0.44892847537994385, "step": 23, "step_time": 7.909272117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.05121527845039964, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05121527845039964, "entropy": 3.2346018254756927, "epoch": 0.00024, "grad_norm": 0.15784446895122528, "kl": 0.02007588549167849, "learning_rate": 6.571428571428572e-06, "loss": -0.0035, "step": 24, "step_time": 4.378888097001436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 258.125, "completions/mean_terminated_length": 258.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.620179995894432, "epoch": 0.00025, "frac_reward_zero_std": 0.5, "grad_norm": 0.3508198857307434, "kl": 0.03346719349065097, "learning_rate": 6.857142857142858e-06, "loss": -0.0002, "num_tokens": 545454.0, "reward": 2.348546028137207, "reward_std": 1.439249873161316, "rewards/rollout_reward_func/mean": 2.348546028137207, "rewards/rollout_reward_func/std": 2.573024034500122, "sampling/importance_sampling_ratio/max": 0.5957672595977783, "sampling/importance_sampling_ratio/mean": 0.32801398634910583, "sampling/importance_sampling_ratio/min": 0.009621978737413883, "sampling/sampling_logp_difference/max": 2.195632219314575, "sampling/sampling_logp_difference/mean": 0.36713382601737976, "step": 25, "step_time": 9.496772681000039 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 2.6226498037576675, "epoch": 0.00026, "grad_norm": 0.8960912227630615, "kl": 0.11110989438566321, "learning_rate": 7.1428571428571436e-06, "loss": -0.0, "step": 26, "step_time": 4.65142993200061 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.016098485328257084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0285984855145216, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 240.28125, "completions/mean_terminated_length": 240.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.791462779045105, "epoch": 0.00027, "frac_reward_zero_std": 0.5, "grad_norm": 0.1981482356786728, "kl": 0.08286742136260727, "learning_rate": 7.428571428571429e-06, "loss": -0.0088, "num_tokens": 587865.0, "reward": 2.199970245361328, "reward_std": 1.4416431188583374, "rewards/rollout_reward_func/mean": 2.199970245361328, "rewards/rollout_reward_func/std": 2.7269999980926514, "sampling/importance_sampling_ratio/max": 0.5932878851890564, "sampling/importance_sampling_ratio/mean": 0.3510062098503113, "sampling/importance_sampling_ratio/min": 0.014670351520180702, "sampling/sampling_logp_difference/max": 2.3246707916259766, "sampling/sampling_logp_difference/mean": 0.36131221055984497, "step": 27, "step_time": 8.463344640999821 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 2.7934266924858093, "epoch": 0.00028, "grad_norm": 0.25135743618011475, "kl": 0.08460524115071166, "learning_rate": 7.714285714285716e-06, "loss": -0.0087, "step": 28, "step_time": 4.655046421999032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 190.40625, "completions/mean_terminated_length": 196.03225708007812, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5830374658107758, "epoch": 0.00029, "frac_reward_zero_std": 0.0, "grad_norm": 0.42510339617729187, "kl": 0.056546261126641184, "learning_rate": 8.000000000000001e-06, "loss": -0.0015, "num_tokens": 630168.0, "reward": 0.955860435962677, "reward_std": 2.9645724296569824, "rewards/rollout_reward_func/mean": 0.955860435962677, "rewards/rollout_reward_func/std": 3.2620840072631836, "sampling/importance_sampling_ratio/max": 0.5791006684303284, "sampling/importance_sampling_ratio/mean": 0.3149833083152771, "sampling/importance_sampling_ratio/min": 0.003035087836906314, "sampling/sampling_logp_difference/max": 1.9127482175827026, "sampling/sampling_logp_difference/mean": 0.33079004287719727, "step": 29, "step_time": 9.575771130997964 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 2.584301859140396, "epoch": 0.0003, "grad_norm": 0.23514825105667114, "kl": 0.05904051155084744, "learning_rate": 8.285714285714287e-06, "loss": -0.0019, "step": 30, "step_time": 4.642899193998346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 225.46875, "completions/mean_terminated_length": 220.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.97169691324234, "epoch": 0.00031, "frac_reward_zero_std": 0.0, "grad_norm": 0.2808249294757843, "kl": 0.015363822691142559, "learning_rate": 8.571428571428571e-06, "loss": -0.0021, "num_tokens": 675360.0, "reward": 0.39564892649650574, "reward_std": 2.7179932594299316, "rewards/rollout_reward_func/mean": 0.39564892649650574, "rewards/rollout_reward_func/std": 3.2776944637298584, "sampling/importance_sampling_ratio/max": 0.6269606947898865, "sampling/importance_sampling_ratio/mean": 0.2631545066833496, "sampling/importance_sampling_ratio/min": 0.041307929903268814, "sampling/sampling_logp_difference/max": 1.6358973979949951, "sampling/sampling_logp_difference/mean": 0.3633136749267578, "step": 31, "step_time": 8.169762125001398 }, { "clip_ratio/high_max": 0.021780303679406643, "clip_ratio/high_mean": 0.010890151839703321, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029119318816810846, "entropy": 2.9928270876407623, "epoch": 0.00032, "grad_norm": 0.25205162167549133, "kl": 0.01888048052205704, "learning_rate": 8.857142857142858e-06, "loss": -0.0023, "step": 32, "step_time": 4.5527972239997325 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.03219697065651417, "completions/clipped_ratio": 0.0, "completions/max_length": 624.0, "completions/max_terminated_length": 624.0, "completions/mean_length": 266.59375, "completions/mean_terminated_length": 266.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.95265731215477, "epoch": 0.00033, "frac_reward_zero_std": 0.0, "grad_norm": 0.3819553852081299, "kl": 0.03062636696267873, "learning_rate": 9.142857142857144e-06, "loss": -0.0024, "num_tokens": 722191.0, "reward": 0.14489834010601044, "reward_std": 3.0729634761810303, "rewards/rollout_reward_func/mean": 0.14489834010601044, "rewards/rollout_reward_func/std": 3.0972211360931396, "sampling/importance_sampling_ratio/max": 0.5641534328460693, "sampling/importance_sampling_ratio/mean": 0.28035426139831543, "sampling/importance_sampling_ratio/min": 0.007912340573966503, "sampling/sampling_logp_difference/max": 1.8581523895263672, "sampling/sampling_logp_difference/mean": 0.3853089213371277, "step": 33, "step_time": 8.661423679998734 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.040798612870275974, "clip_ratio/low_min": 0.0243055559694767, "clip_ratio/region_mean": 0.04648043122142553, "entropy": 2.9283075034618378, "epoch": 0.00034, "grad_norm": 0.20521913468837738, "kl": 0.0580187423620373, "learning_rate": 9.42857142857143e-06, "loss": -0.0029, "step": 34, "step_time": 5.018625251998856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 118.375, "completions/mean_terminated_length": 118.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5763224959373474, "epoch": 0.00035, "frac_reward_zero_std": 0.25, "grad_norm": 0.6933721899986267, "kl": 0.10632833879208192, "learning_rate": 9.714285714285715e-06, "loss": -0.0077, "num_tokens": 760474.0, "reward": 1.4863779544830322, "reward_std": 1.5287230014801025, "rewards/rollout_reward_func/mean": 1.4863779544830322, "rewards/rollout_reward_func/std": 3.2098679542541504, "sampling/importance_sampling_ratio/max": 0.6021684408187866, "sampling/importance_sampling_ratio/mean": 0.37816816568374634, "sampling/importance_sampling_ratio/min": 0.013056308962404728, "sampling/sampling_logp_difference/max": 2.7211084365844727, "sampling/sampling_logp_difference/mean": 0.353171169757843, "step": 35, "step_time": 7.753367577999597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.024479167070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024479167070239782, "entropy": 2.5670806169509888, "epoch": 0.00036, "grad_norm": 0.20287086069583893, "kl": 0.12393587516271509, "learning_rate": 1e-05, "loss": -0.008, "step": 36, "step_time": 4.163353227998414 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 544.0, "completions/max_terminated_length": 544.0, "completions/mean_length": 182.5, "completions/mean_terminated_length": 171.61289978027344, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.776189923286438, "epoch": 0.00037, "frac_reward_zero_std": 0.25, "grad_norm": 0.718150794506073, "kl": 0.27011112417676486, "learning_rate": 9.999999999884322e-06, "loss": -0.0095, "num_tokens": 802561.0, "reward": 0.6463991403579712, "reward_std": 2.35530948638916, "rewards/rollout_reward_func/mean": 0.6463991403579712, "rewards/rollout_reward_func/std": 3.4667346477508545, "sampling/importance_sampling_ratio/max": 0.5866615772247314, "sampling/importance_sampling_ratio/mean": 0.3335971236228943, "sampling/importance_sampling_ratio/min": 0.00021820227266289294, "sampling/sampling_logp_difference/max": 4.184645652770996, "sampling/sampling_logp_difference/mean": 0.36202263832092285, "step": 37, "step_time": 8.146357539000746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019196428591385484, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019196428591385484, "entropy": 2.7618333101272583, "epoch": 0.00038, "grad_norm": 1.0092581510543823, "kl": 0.5067894529784098, "learning_rate": 9.999999999537282e-06, "loss": -0.0098, "step": 38, "step_time": 5.406113477999497 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019744318444281816, "completions/clipped_ratio": 0.03125, "completions/max_length": 744.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 310.4375, "completions/mean_terminated_length": 296.45159912109375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7289285361766815, "epoch": 0.00039, "frac_reward_zero_std": 0.25, "grad_norm": 0.21928273141384125, "kl": 0.03718407082487829, "learning_rate": 9.999999998958884e-06, "loss": -0.0051, "num_tokens": 849978.0, "reward": 2.7962937355041504, "reward_std": 2.092611312866211, "rewards/rollout_reward_func/mean": 2.7962937355041504, "rewards/rollout_reward_func/std": 2.5971806049346924, "sampling/importance_sampling_ratio/max": 0.6065940856933594, "sampling/importance_sampling_ratio/mean": 0.2727394998073578, "sampling/importance_sampling_ratio/min": 0.017602022737264633, "sampling/sampling_logp_difference/max": 1.9360582828521729, "sampling/sampling_logp_difference/mean": 0.3324418067932129, "step": 39, "step_time": 8.541660562999823 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.021875000093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027556818444281816, "entropy": 2.729422777891159, "epoch": 0.0004, "grad_norm": 0.2590782344341278, "kl": 0.07293493900215253, "learning_rate": 9.999999998149125e-06, "loss": -0.0049, "step": 40, "step_time": 4.732443049002541 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0243055559694767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029513888992369175, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 111.34375, "completions/mean_terminated_length": 111.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3829638957977295, "epoch": 0.00041, "frac_reward_zero_std": 0.25, "grad_norm": 1.3550938367843628, "kl": 0.2143448798742611, "learning_rate": 9.99999999710801e-06, "loss": -0.0025, "num_tokens": 888016.0, "reward": 0.23336726427078247, "reward_std": 1.6899447441101074, "rewards/rollout_reward_func/mean": 0.23336726427078247, "rewards/rollout_reward_func/std": 3.296257734298706, "sampling/importance_sampling_ratio/max": 0.6236161589622498, "sampling/importance_sampling_ratio/mean": 0.42243075370788574, "sampling/importance_sampling_ratio/min": 0.008544650860130787, "sampling/sampling_logp_difference/max": 2.514310359954834, "sampling/sampling_logp_difference/mean": 0.30278927087783813, "step": 41, "step_time": 7.9589745689991105 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.017361111473292112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022569444496184587, "entropy": 2.3600710332393646, "epoch": 0.00042, "grad_norm": 0.7117645740509033, "kl": 0.22721410644589923, "learning_rate": 9.999999995835533e-06, "loss": -0.0027, "step": 42, "step_time": 5.202910259999953 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.008658008649945259, "clip_ratio/low_min": 0.0059523810632526875, "clip_ratio/region_mean": 0.021158008836209774, "completions/clipped_ratio": 0.03125, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 284.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6091897934675217, "epoch": 0.00043, "frac_reward_zero_std": 0.25, "grad_norm": 8.355428695678711, "kl": 1.349946062779054, "learning_rate": 9.999999994331697e-06, "loss": -0.0014, "num_tokens": 932360.0, "reward": 1.9460515975952148, "reward_std": 2.556663751602173, "rewards/rollout_reward_func/mean": 1.9460515975952148, "rewards/rollout_reward_func/std": 3.0885021686553955, "sampling/importance_sampling_ratio/max": 0.5687183141708374, "sampling/importance_sampling_ratio/mean": 0.27799713611602783, "sampling/importance_sampling_ratio/min": 0.014925064519047737, "sampling/sampling_logp_difference/max": 2.6669070720672607, "sampling/sampling_logp_difference/mean": 0.33453667163848877, "step": 43, "step_time": 8.775682885001515 }, { "clip_ratio/high_max": 0.05025252606719732, "clip_ratio/high_mean": 0.02512626303359866, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02512626303359866, "entropy": 2.6154074668884277, "epoch": 0.00044, "grad_norm": 0.3274429142475128, "kl": 0.09164868574589491, "learning_rate": 9.999999992596503e-06, "loss": -0.0094, "step": 44, "step_time": 4.680068101999495 }, { "clip_ratio/high_max": 0.034722222946584225, "clip_ratio/high_mean": 0.017361111473292112, "clip_ratio/low_mean": 0.065798613242805, "clip_ratio/low_min": 0.033333334140479565, "clip_ratio/region_mean": 0.0831597251817584, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 300.0625, "completions/mean_terminated_length": 300.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 3.3571303486824036, "epoch": 0.00045, "frac_reward_zero_std": 0.0, "grad_norm": 0.3407910168170929, "kl": 0.10123429959639907, "learning_rate": 9.999999990629948e-06, "loss": -0.0064, "num_tokens": 980760.0, "reward": 0.783581018447876, "reward_std": 3.526195526123047, "rewards/rollout_reward_func/mean": 0.783581018447876, "rewards/rollout_reward_func/std": 3.3680193424224854, "sampling/importance_sampling_ratio/max": 0.5847072601318359, "sampling/importance_sampling_ratio/mean": 0.24667662382125854, "sampling/importance_sampling_ratio/min": 1.7736905694176741e-12, "sampling/sampling_logp_difference/max": 10.852811813354492, "sampling/sampling_logp_difference/mean": 0.5838006138801575, "step": 45, "step_time": 8.772520229999827 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0437500006519258, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.05069444514811039, "entropy": 3.354842036962509, "epoch": 0.00046, "grad_norm": 0.44516900181770325, "kl": 0.1013536008540541, "learning_rate": 9.999999988432035e-06, "loss": -0.0063, "step": 46, "step_time": 5.730669249001039 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016741071827709675, "completions/clipped_ratio": 0.03125, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 102.9375, "completions/mean_terminated_length": 91.45160675048828, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.1211505830287933, "epoch": 0.00047, "frac_reward_zero_std": 0.5, "grad_norm": 0.12849338352680206, "kl": 0.07529694779077545, "learning_rate": 9.999999986002761e-06, "loss": -0.006, "num_tokens": 1016899.0, "reward": 3.4929916858673096, "reward_std": 0.8375400304794312, "rewards/rollout_reward_func/mean": 3.4929916858673096, "rewards/rollout_reward_func/std": 1.7731660604476929, "sampling/importance_sampling_ratio/max": 1.0617642402648926, "sampling/importance_sampling_ratio/mean": 0.47195756435394287, "sampling/importance_sampling_ratio/min": 8.988847316404442e-21, "sampling/sampling_logp_difference/max": 18.769134521484375, "sampling/sampling_logp_difference/mean": 0.4733576476573944, "step": 47, "step_time": 8.076779775000432 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 2.0891966968774796, "epoch": 0.00048, "grad_norm": 0.1472642868757248, "kl": 0.06522011535707861, "learning_rate": 9.999999983342127e-06, "loss": -0.0063, "step": 48, "step_time": 4.366419464000501 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.03496212186291814, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04746212204918265, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 402.875, "completions/mean_terminated_length": 402.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.454709380865097, "epoch": 0.00049, "frac_reward_zero_std": 0.0, "grad_norm": 0.20916694402694702, "kl": 0.13525894307531416, "learning_rate": 9.999999980450137e-06, "loss": -0.0049, "num_tokens": 1069017.0, "reward": 0.33687102794647217, "reward_std": 2.6168036460876465, "rewards/rollout_reward_func/mean": 0.33687102794647217, "rewards/rollout_reward_func/std": 2.9823973178863525, "sampling/importance_sampling_ratio/max": 0.47108548879623413, "sampling/importance_sampling_ratio/mean": 0.23205065727233887, "sampling/importance_sampling_ratio/min": 8.144973447842077e-22, "sampling/sampling_logp_difference/max": 20.414316177368164, "sampling/sampling_logp_difference/mean": 0.8938305974006653, "step": 49, "step_time": 8.759724919999826 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.009999999776482582, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022499999962747097, "entropy": 2.4418140053749084, "epoch": 0.0005, "grad_norm": 0.20883126556873322, "kl": 0.10223897837568074, "learning_rate": 9.999999977326787e-06, "loss": -0.0052, "step": 50, "step_time": 5.079798426000707 }, { "clip_ratio/high_max": 0.06360479909926653, "clip_ratio/high_mean": 0.031802399549633265, "clip_ratio/low_mean": 0.02291666716337204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.054719066713005304, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 304.1875, "completions/mean_terminated_length": 304.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 3.0433402955532074, "epoch": 0.00051, "frac_reward_zero_std": 0.0, "grad_norm": 0.19756755232810974, "kl": 0.128817240241915, "learning_rate": 9.999999973972076e-06, "loss": 0.0002, "num_tokens": 1116690.0, "reward": 0.6315986514091492, "reward_std": 3.0274603366851807, "rewards/rollout_reward_func/mean": 0.6315986514091492, "rewards/rollout_reward_func/std": 2.993239164352417, "sampling/importance_sampling_ratio/max": 0.6633039712905884, "sampling/importance_sampling_ratio/mean": 0.3017253577709198, "sampling/importance_sampling_ratio/min": 9.948342899636142e-21, "sampling/sampling_logp_difference/max": 14.285256385803223, "sampling/sampling_logp_difference/mean": 0.5861150622367859, "step": 51, "step_time": 9.117175093001606 }, { "clip_ratio/high_max": 0.06613005138933659, "clip_ratio/high_mean": 0.03306502569466829, "clip_ratio/low_mean": 0.016666667070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.049731692764908075, "entropy": 3.0335496366024017, "epoch": 0.00052, "grad_norm": 0.18290506303310394, "kl": 0.12607903964817524, "learning_rate": 9.999999970386004e-06, "loss": 0.0003, "step": 52, "step_time": 4.663150890999532 }, { "clip_ratio/high_max": 0.037500000558793545, "clip_ratio/high_mean": 0.018750000279396772, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125000046566129, "completions/clipped_ratio": 0.03125, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 282.09375, "completions/mean_terminated_length": 278.258056640625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4818911105394363, "epoch": 0.00053, "frac_reward_zero_std": 0.25, "grad_norm": 0.6424238681793213, "kl": 0.4507314027287066, "learning_rate": 9.999999966568576e-06, "loss": 0.0043, "num_tokens": 1162473.0, "reward": 1.8163468837738037, "reward_std": 2.4809772968292236, "rewards/rollout_reward_func/mean": 1.8163468837738037, "rewards/rollout_reward_func/std": 3.2278072834014893, "sampling/importance_sampling_ratio/max": 0.6591382622718811, "sampling/importance_sampling_ratio/mean": 0.34810492396354675, "sampling/importance_sampling_ratio/min": 0.0027664124500006437, "sampling/sampling_logp_difference/max": 3.3028693199157715, "sampling/sampling_logp_difference/mean": 0.3139556050300598, "step": 53, "step_time": 8.737825266000982 }, { "clip_ratio/high_max": 0.03636363707482815, "clip_ratio/high_mean": 0.018181818537414074, "clip_ratio/low_mean": 0.025000000838190317, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043181818909943104, "entropy": 2.5050487220287323, "epoch": 0.00054, "grad_norm": 0.6114789247512817, "kl": 0.3469981327652931, "learning_rate": 9.999999962519787e-06, "loss": 0.0039, "step": 54, "step_time": 4.664913984997838 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.03281250037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0384943182580173, "completions/clipped_ratio": 0.03125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 359.625, "completions/mean_terminated_length": 359.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.8838580548763275, "epoch": 0.00055, "frac_reward_zero_std": 0.0, "grad_norm": 0.3690301477909088, "kl": 0.21350001264363527, "learning_rate": 9.999999958239642e-06, "loss": -0.02, "num_tokens": 1213199.0, "reward": -0.5511295199394226, "reward_std": 2.717623710632324, "rewards/rollout_reward_func/mean": -0.5511295199394226, "rewards/rollout_reward_func/std": 2.772667646408081, "sampling/importance_sampling_ratio/max": 0.5956400036811829, "sampling/importance_sampling_ratio/mean": 0.2782037556171417, "sampling/importance_sampling_ratio/min": 1.7953210473820036e-08, "sampling/sampling_logp_difference/max": 7.516845226287842, "sampling/sampling_logp_difference/mean": 0.4300991892814636, "step": 55, "step_time": 9.525844230000075 }, { "clip_ratio/high_max": 0.030952381435781717, "clip_ratio/high_mean": 0.015476190717890859, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04047619178891182, "entropy": 2.8672271072864532, "epoch": 0.00056, "grad_norm": 0.35541823506355286, "kl": 0.17689757759217173, "learning_rate": 9.999999953728133e-06, "loss": -0.0203, "step": 56, "step_time": 4.582618672000535 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.013494318351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02130681835114956, "completions/clipped_ratio": 0.0625, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 220.125, "completions/mean_terminated_length": 233.7333526611328, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4740554839372635, "epoch": 0.00057, "frac_reward_zero_std": 0.0, "grad_norm": 5.757898807525635, "kl": 0.3704188368283212, "learning_rate": 9.999999948985266e-06, "loss": -0.0175, "num_tokens": 1256493.0, "reward": -0.5809816122055054, "reward_std": 1.5138485431671143, "rewards/rollout_reward_func/mean": -0.5809816122055054, "rewards/rollout_reward_func/std": 2.764249324798584, "sampling/importance_sampling_ratio/max": 0.690333902835846, "sampling/importance_sampling_ratio/mean": 0.3601253032684326, "sampling/importance_sampling_ratio/min": 0.005966894794255495, "sampling/sampling_logp_difference/max": 2.0939223766326904, "sampling/sampling_logp_difference/mean": 0.29007619619369507, "step": 57, "step_time": 8.427904373000274 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.017045455053448677, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027462122030556202, "entropy": 2.466378837823868, "epoch": 0.00058, "grad_norm": 0.308349072933197, "kl": 0.12425832008011639, "learning_rate": 9.99999994401104e-06, "loss": -0.02, "step": 58, "step_time": 4.549335121998411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021158009069040418, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021158009069040418, "completions/clipped_ratio": 0.03125, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 252.0, "completions/mean_terminated_length": 246.6774139404297, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.571542501449585, "epoch": 0.00059, "frac_reward_zero_std": 0.25, "grad_norm": 0.24225421249866486, "kl": 0.30457523884251714, "learning_rate": 9.999999938805455e-06, "loss": -0.0032, "num_tokens": 1300039.0, "reward": 1.8227726221084595, "reward_std": 2.3441781997680664, "rewards/rollout_reward_func/mean": 1.8227726221084595, "rewards/rollout_reward_func/std": 2.9741404056549072, "sampling/importance_sampling_ratio/max": 0.6873592138290405, "sampling/importance_sampling_ratio/mean": 0.39568793773651123, "sampling/importance_sampling_ratio/min": 6.846766249593829e-19, "sampling/sampling_logp_difference/max": 17.39272117614746, "sampling/sampling_logp_difference/mean": 0.5334513187408447, "step": 59, "step_time": 9.53515045200038 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018181818537414074, "entropy": 2.556135505437851, "epoch": 0.0006, "grad_norm": 0.29070553183555603, "kl": 0.2604641974903643, "learning_rate": 9.999999933368511e-06, "loss": -0.0031, "step": 60, "step_time": 4.740452592000111 }, { "clip_ratio/high_max": 0.02638888917863369, "clip_ratio/high_mean": 0.013194444589316845, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027083334047347307, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 404.59375, "completions/mean_terminated_length": 404.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.8183280527591705, "epoch": 0.00061, "frac_reward_zero_std": 0.0, "grad_norm": 0.49202990531921387, "kl": 0.41977154929190874, "learning_rate": 9.999999927700208e-06, "loss": -0.0126, "num_tokens": 1352002.0, "reward": -0.19866448640823364, "reward_std": 2.5248358249664307, "rewards/rollout_reward_func/mean": -0.19866448640823364, "rewards/rollout_reward_func/std": 2.8145956993103027, "sampling/importance_sampling_ratio/max": 0.6076127290725708, "sampling/importance_sampling_ratio/mean": 0.28508585691452026, "sampling/importance_sampling_ratio/min": 0.01555725745856762, "sampling/sampling_logp_difference/max": 2.166548490524292, "sampling/sampling_logp_difference/mean": 0.3595178723335266, "step": 61, "step_time": 8.632366242000899 }, { "clip_ratio/high_max": 0.049116162583231926, "clip_ratio/high_mean": 0.024558081291615963, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03237058129161596, "entropy": 2.777392953634262, "epoch": 0.00062, "grad_norm": 0.3118762969970703, "kl": 0.37724288646131754, "learning_rate": 9.999999921800544e-06, "loss": -0.0127, "step": 62, "step_time": 4.666704722999384 }, { "clip_ratio/high_max": 0.025252525694668293, "clip_ratio/high_mean": 0.012626262847334146, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012626262847334146, "completions/clipped_ratio": 0.03125, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 275.5, "completions/mean_terminated_length": 269.58062744140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.336562752723694, "epoch": 0.00063, "frac_reward_zero_std": 0.25, "grad_norm": 0.7082567811012268, "kl": 0.1581236650235951, "learning_rate": 9.999999915669521e-06, "loss": -0.0148, "num_tokens": 1396655.0, "reward": 0.8462115526199341, "reward_std": 1.8160178661346436, "rewards/rollout_reward_func/mean": 0.8462115526199341, "rewards/rollout_reward_func/std": 3.1953089237213135, "sampling/importance_sampling_ratio/max": 0.6552026271820068, "sampling/importance_sampling_ratio/mean": 0.34656041860580444, "sampling/importance_sampling_ratio/min": 0.0029228406492620707, "sampling/sampling_logp_difference/max": 5.2165207862854, "sampling/sampling_logp_difference/mean": 0.3104858994483948, "step": 63, "step_time": 9.099256475000402 }, { "clip_ratio/high_max": 0.025252525694668293, "clip_ratio/high_mean": 0.012626262847334146, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019570707343518734, "entropy": 2.3187194764614105, "epoch": 0.00064, "grad_norm": 0.14218802750110626, "kl": 0.1248642154969275, "learning_rate": 9.99999990930714e-06, "loss": -0.0154, "step": 64, "step_time": 5.058028757999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.024479167070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024479167070239782, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 216.15625, "completions/mean_terminated_length": 216.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.531227856874466, "epoch": 0.00065, "frac_reward_zero_std": 0.0, "grad_norm": 1.585115909576416, "kl": 0.3141834079287946, "learning_rate": 9.999999902713398e-06, "loss": -0.0032, "num_tokens": 1441165.0, "reward": -0.27969300746917725, "reward_std": 2.9374101161956787, "rewards/rollout_reward_func/mean": -0.27969300746917725, "rewards/rollout_reward_func/std": 2.9488377571105957, "sampling/importance_sampling_ratio/max": 0.840031623840332, "sampling/importance_sampling_ratio/mean": 0.40682923793792725, "sampling/importance_sampling_ratio/min": 0.012160572223365307, "sampling/sampling_logp_difference/max": 1.9474881887435913, "sampling/sampling_logp_difference/mean": 0.3118096590042114, "step": 65, "step_time": 8.323025914001846 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.046875000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0572916679084301, "entropy": 2.5163692831993103, "epoch": 0.00066, "grad_norm": 19.35067367553711, "kl": 5.455378066282719, "learning_rate": 9.999999895888298e-06, "loss": 0.0057, "step": 66, "step_time": 4.707739549000507 }, { "clip_ratio/high_max": 0.026988636702299118, "clip_ratio/high_mean": 0.019176136702299118, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026120580732822418, "completions/clipped_ratio": 0.03125, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 315.25, "completions/mean_terminated_length": 312.6451416015625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6580385267734528, "epoch": 0.00067, "frac_reward_zero_std": 0.0, "grad_norm": 0.846753716468811, "kl": 0.23914973251521587, "learning_rate": 9.99999988883184e-06, "loss": -0.0294, "num_tokens": 1489449.0, "reward": -0.017722517251968384, "reward_std": 2.588423728942871, "rewards/rollout_reward_func/mean": -0.017722517251968384, "rewards/rollout_reward_func/std": 3.315229654312134, "sampling/importance_sampling_ratio/max": 1.53274667263031, "sampling/importance_sampling_ratio/mean": 0.30872246623039246, "sampling/importance_sampling_ratio/min": 0.008587171323597431, "sampling/sampling_logp_difference/max": 2.660094738006592, "sampling/sampling_logp_difference/mean": 0.35753098130226135, "step": 67, "step_time": 9.066220790001353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.639769285917282, "epoch": 0.00068, "grad_norm": 0.9876938462257385, "kl": 0.21986476704478264, "learning_rate": 9.999999881544019e-06, "loss": -0.0302, "step": 68, "step_time": 4.947790696000084 }, { "clip_ratio/high_max": 0.03914141468703747, "clip_ratio/high_mean": 0.019570707343518734, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019570707343518734, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.5055821537971497, "epoch": 0.00069, "frac_reward_zero_std": 0.5, "grad_norm": 1.7208331823349, "kl": 0.24184659007005394, "learning_rate": 9.999999874024841e-06, "loss": -0.0126, "num_tokens": 1526203.0, "reward": 2.2172718048095703, "reward_std": 1.8295981884002686, "rewards/rollout_reward_func/mean": 2.2172718048095703, "rewards/rollout_reward_func/std": 3.0762734413146973, "sampling/importance_sampling_ratio/max": 0.7073167562484741, "sampling/importance_sampling_ratio/mean": 0.42706984281539917, "sampling/importance_sampling_ratio/min": 0.017111409455537796, "sampling/sampling_logp_difference/max": 1.8124048709869385, "sampling/sampling_logp_difference/mean": 0.32563912868499756, "step": 69, "step_time": 7.817175942999711 }, { "clip_ratio/high_max": 0.038194444961845875, "clip_ratio/high_mean": 0.019097222480922937, "clip_ratio/low_mean": 0.02777777798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04687500046566129, "entropy": 2.5096128582954407, "epoch": 0.0007, "grad_norm": 2.205251693725586, "kl": 1.2448946423828602, "learning_rate": 9.999999866274303e-06, "loss": -0.0112, "step": 70, "step_time": 4.2832416039991585 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021006944589316845, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 271.96875, "completions/mean_terminated_length": 271.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.262143462896347, "epoch": 0.00071, "frac_reward_zero_std": 0.0, "grad_norm": 0.40326690673828125, "kl": 0.3753017848357558, "learning_rate": 9.999999858292407e-06, "loss": -0.0267, "num_tokens": 1570302.0, "reward": 0.9290536642074585, "reward_std": 2.644033432006836, "rewards/rollout_reward_func/mean": 0.9290536642074585, "rewards/rollout_reward_func/std": 3.344982624053955, "sampling/importance_sampling_ratio/max": 0.6997819542884827, "sampling/importance_sampling_ratio/mean": 0.39940544962882996, "sampling/importance_sampling_ratio/min": 0.05729014053940773, "sampling/sampling_logp_difference/max": 1.580177664756775, "sampling/sampling_logp_difference/mean": 0.2524714767932892, "step": 71, "step_time": 8.841797711000254 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0390625, "entropy": 2.284905269742012, "epoch": 0.00072, "grad_norm": 6.859172344207764, "kl": 1.263947894796729, "learning_rate": 9.99999985007915e-06, "loss": -0.0253, "step": 72, "step_time": 4.541057797000576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 404.46875, "completions/mean_terminated_length": 404.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.51061849296093, "epoch": 0.00073, "frac_reward_zero_std": 0.0, "grad_norm": 2.302504062652588, "kl": 0.9222528701648116, "learning_rate": 9.999999841634535e-06, "loss": -0.0082, "num_tokens": 1621727.0, "reward": -0.7880896329879761, "reward_std": 2.0208892822265625, "rewards/rollout_reward_func/mean": -0.7880896329879761, "rewards/rollout_reward_func/std": 2.709735631942749, "sampling/importance_sampling_ratio/max": 0.8968761563301086, "sampling/importance_sampling_ratio/mean": 0.34243810176849365, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.634542942047119, "sampling/sampling_logp_difference/mean": 0.32848647236824036, "step": 73, "step_time": 9.10820267300096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.04010416753590107, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04010416753590107, "entropy": 2.563834100961685, "epoch": 0.00074, "grad_norm": 0.3644697666168213, "kl": 0.36825570929795504, "learning_rate": 9.99999983295856e-06, "loss": -0.0069, "step": 74, "step_time": 4.722849544998098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.038194444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.038194444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 103.40625, "completions/mean_terminated_length": 103.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.061522677540779, "epoch": 0.00075, "frac_reward_zero_std": 0.25, "grad_norm": 0.5606231093406677, "kl": 0.21017898945137858, "learning_rate": 9.999999824051225e-06, "loss": -0.0244, "num_tokens": 1658705.0, "reward": 1.1765587329864502, "reward_std": 2.501732587814331, "rewards/rollout_reward_func/mean": 1.1765587329864502, "rewards/rollout_reward_func/std": 3.3966777324676514, "sampling/importance_sampling_ratio/max": 0.7627377510070801, "sampling/importance_sampling_ratio/mean": 0.5141996145248413, "sampling/importance_sampling_ratio/min": 1.1713093828191745e-09, "sampling/sampling_logp_difference/max": 7.898372173309326, "sampling/sampling_logp_difference/mean": 0.33391663432121277, "step": 75, "step_time": 7.57792517300004 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.03472222248092294, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040404040832072496, "entropy": 2.060318410396576, "epoch": 0.00076, "grad_norm": 0.37338006496429443, "kl": 0.2524644515942782, "learning_rate": 9.999999814912531e-06, "loss": -0.0253, "step": 76, "step_time": 4.5355984410007295 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 194.28125, "completions/mean_terminated_length": 194.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.94745434820652, "epoch": 0.00077, "frac_reward_zero_std": 0.5, "grad_norm": 0.3445930778980255, "kl": 0.16736432584002614, "learning_rate": 9.999999805542478e-06, "loss": -0.0183, "num_tokens": 1699914.0, "reward": 3.4025425910949707, "reward_std": 0.9160298705101013, "rewards/rollout_reward_func/mean": 3.4025425910949707, "rewards/rollout_reward_func/std": 1.869390845298767, "sampling/importance_sampling_ratio/max": 0.813917338848114, "sampling/importance_sampling_ratio/mean": 0.468423068523407, "sampling/importance_sampling_ratio/min": 6.12808436536171e-21, "sampling/sampling_logp_difference/max": 16.913782119750977, "sampling/sampling_logp_difference/mean": 0.5571764707565308, "step": 77, "step_time": 8.913381650001611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.92782723903656, "epoch": 0.00078, "grad_norm": 0.3132149279117584, "kl": 0.23644063854590058, "learning_rate": 9.999999795941065e-06, "loss": -0.0185, "step": 78, "step_time": 4.397163133999129 }, { "clip_ratio/high_max": 0.0367567571811378, "clip_ratio/high_mean": 0.0183783785905689, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024628378683701158, "completions/clipped_ratio": 0.03125, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 293.625, "completions/mean_terminated_length": 282.7419128417969, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1656928956508636, "epoch": 0.00079, "frac_reward_zero_std": 0.25, "grad_norm": 0.3871540129184723, "kl": 0.19812258332967758, "learning_rate": 9.999999786108293e-06, "loss": -0.0202, "num_tokens": 1746182.0, "reward": 2.1568219661712646, "reward_std": 2.4764516353607178, "rewards/rollout_reward_func/mean": 2.1568219661712646, "rewards/rollout_reward_func/std": 2.9280734062194824, "sampling/importance_sampling_ratio/max": 0.7869672179222107, "sampling/importance_sampling_ratio/mean": 0.39297473430633545, "sampling/importance_sampling_ratio/min": 3.628746532500212e-22, "sampling/sampling_logp_difference/max": 16.387094497680664, "sampling/sampling_logp_difference/mean": 0.5096595287322998, "step": 79, "step_time": 8.850599870999758 }, { "clip_ratio/high_max": 0.031013513915240765, "clip_ratio/high_mean": 0.015506756957620382, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028006757143884897, "entropy": 2.1341805458068848, "epoch": 0.0008, "grad_norm": 0.34065571427345276, "kl": 0.21595221292227507, "learning_rate": 9.999999776044163e-06, "loss": -0.0209, "step": 80, "step_time": 5.1028459619992645 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.016098485328257084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02178030274808407, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 206.1875, "completions/mean_terminated_length": 206.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.3100844621658325, "epoch": 0.00081, "frac_reward_zero_std": 0.0, "grad_norm": 5.163079738616943, "kl": 2.273834176827222, "learning_rate": 9.999999765748672e-06, "loss": -0.0283, "num_tokens": 1788720.0, "reward": 0.7703145742416382, "reward_std": 2.528787612915039, "rewards/rollout_reward_func/mean": 0.7703145742416382, "rewards/rollout_reward_func/std": 3.311854362487793, "sampling/importance_sampling_ratio/max": 0.7970731258392334, "sampling/importance_sampling_ratio/mean": 0.48718783259391785, "sampling/importance_sampling_ratio/min": 3.7321768852933194e-19, "sampling/sampling_logp_difference/max": 19.91360855102539, "sampling/sampling_logp_difference/mean": 0.6429722309112549, "step": 81, "step_time": 8.829804724000496 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017045455053448677, "entropy": 2.271873190999031, "epoch": 0.00082, "grad_norm": 1.1490485668182373, "kl": 0.5247112140059471, "learning_rate": 9.999999755221823e-06, "loss": -0.0316, "step": 82, "step_time": 4.621973430000253 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.012626262847334146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04387626377865672, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 241.34375, "completions/mean_terminated_length": 241.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9270499497652054, "epoch": 0.00083, "frac_reward_zero_std": 0.0, "grad_norm": 2.481043815612793, "kl": 0.26738595496863127, "learning_rate": 9.999999744463613e-06, "loss": 0.0046, "num_tokens": 1832903.0, "reward": 0.9745981097221375, "reward_std": 2.6684117317199707, "rewards/rollout_reward_func/mean": 0.9745981097221375, "rewards/rollout_reward_func/std": 3.3679678440093994, "sampling/importance_sampling_ratio/max": 1.1726807355880737, "sampling/importance_sampling_ratio/mean": 0.5075311064720154, "sampling/importance_sampling_ratio/min": 9.086892378145934e-15, "sampling/sampling_logp_difference/max": 12.919556617736816, "sampling/sampling_logp_difference/mean": 0.3794715702533722, "step": 83, "step_time": 8.310640733000582 }, { "clip_ratio/high_max": 0.06250000186264515, "clip_ratio/high_mean": 0.031250000931322575, "clip_ratio/low_mean": 0.018308081198483706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04955808212980628, "entropy": 1.882372260093689, "epoch": 0.00084, "grad_norm": 0.6479469537734985, "kl": 0.25546986144036055, "learning_rate": 9.999999733474045e-06, "loss": 0.0016, "step": 84, "step_time": 5.0099814949990105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 102.3125, "completions/mean_terminated_length": 105.09677124023438, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5515034198760986, "epoch": 0.00085, "frac_reward_zero_std": 0.5, "grad_norm": 0.5066191554069519, "kl": 0.1434833575040102, "learning_rate": 9.999999722253117e-06, "loss": -0.0128, "num_tokens": 1868222.0, "reward": 3.381326198577881, "reward_std": 1.202255368232727, "rewards/rollout_reward_func/mean": 3.381326198577881, "rewards/rollout_reward_func/std": 1.7988159656524658, "sampling/importance_sampling_ratio/max": 0.8244321346282959, "sampling/importance_sampling_ratio/mean": 0.6375912427902222, "sampling/importance_sampling_ratio/min": 0.038727276027202606, "sampling/sampling_logp_difference/max": 1.849066972732544, "sampling/sampling_logp_difference/mean": 0.16428890824317932, "step": 85, "step_time": 8.246966859000167 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.011101973708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02499086270108819, "entropy": 1.501015916466713, "epoch": 0.00086, "grad_norm": 0.14669431746006012, "kl": 0.14431945281103253, "learning_rate": 9.99999971080083e-06, "loss": -0.0131, "step": 86, "step_time": 5.146135714999218 }, { "clip_ratio/high_max": 0.037500000558793545, "clip_ratio/high_mean": 0.018750000279396772, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0447916672565043, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 189.28125, "completions/mean_terminated_length": 189.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6537679433822632, "epoch": 0.00087, "frac_reward_zero_std": 0.25, "grad_norm": 0.44590240716934204, "kl": 0.2705833809450269, "learning_rate": 9.999999699117184e-06, "loss": -0.0262, "num_tokens": 1908317.0, "reward": 3.2974443435668945, "reward_std": 2.1864736080169678, "rewards/rollout_reward_func/mean": 3.2974443435668945, "rewards/rollout_reward_func/std": 2.499469518661499, "sampling/importance_sampling_ratio/max": 0.83790123462677, "sampling/importance_sampling_ratio/mean": 0.5956872701644897, "sampling/importance_sampling_ratio/min": 0.032226502895355225, "sampling/sampling_logp_difference/max": 1.9425500631332397, "sampling/sampling_logp_difference/mean": 0.18642427027225494, "step": 87, "step_time": 8.465116195999144 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.022916667629033327, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029166667722165585, "entropy": 1.6013593226671219, "epoch": 0.00088, "grad_norm": 0.22049394249916077, "kl": 0.27147836051881313, "learning_rate": 9.999999687202177e-06, "loss": -0.0266, "step": 88, "step_time": 4.534178721000899 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 297.53125, "completions/mean_terminated_length": 297.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9525695890188217, "epoch": 0.00089, "frac_reward_zero_std": 0.0, "grad_norm": 0.6802153587341309, "kl": 0.3439987050369382, "learning_rate": 9.999999675055814e-06, "loss": -0.0138, "num_tokens": 1954905.0, "reward": -0.33461815118789673, "reward_std": 2.490633249282837, "rewards/rollout_reward_func/mean": -0.33461815118789673, "rewards/rollout_reward_func/std": 3.2068443298339844, "sampling/importance_sampling_ratio/max": 1.0638041496276855, "sampling/importance_sampling_ratio/mean": 0.5267090797424316, "sampling/importance_sampling_ratio/min": 0.04246438294649124, "sampling/sampling_logp_difference/max": 1.8757679462432861, "sampling/sampling_logp_difference/mean": 0.22254350781440735, "step": 89, "step_time": 9.143109937999725 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.02187499962747097, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.028124999720603228, "entropy": 1.8948117643594742, "epoch": 0.0009, "grad_norm": 1.0591293573379517, "kl": 0.3850090391933918, "learning_rate": 9.999999662678088e-06, "loss": -0.015, "step": 90, "step_time": 5.193775151000409 }, { "clip_ratio/high_max": 0.03948863688856363, "clip_ratio/high_mean": 0.019744318444281816, "clip_ratio/low_mean": 0.020312500186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04005681909620762, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 327.59375, "completions/mean_terminated_length": 327.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.536841705441475, "epoch": 0.00091, "frac_reward_zero_std": 0.0, "grad_norm": 0.340347021818161, "kl": 0.2339587416499853, "learning_rate": 9.999999650069006e-06, "loss": -0.0013, "num_tokens": 2003411.0, "reward": 0.6218236684799194, "reward_std": 2.8276419639587402, "rewards/rollout_reward_func/mean": 0.6218236684799194, "rewards/rollout_reward_func/std": 3.405416488647461, "sampling/importance_sampling_ratio/max": 1.2139031887054443, "sampling/importance_sampling_ratio/mean": 0.5841308832168579, "sampling/importance_sampling_ratio/min": 0.15868307650089264, "sampling/sampling_logp_difference/max": 1.0458643436431885, "sampling/sampling_logp_difference/mean": 0.1608116328716278, "step": 91, "step_time": 8.54515753799933 }, { "clip_ratio/high_max": 0.06448863819241524, "clip_ratio/high_mean": 0.03224431909620762, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04630681872367859, "entropy": 1.5270984172821045, "epoch": 0.00092, "grad_norm": 0.7889478206634521, "kl": 0.243224008474499, "learning_rate": 9.999999637228563e-06, "loss": -0.0024, "step": 92, "step_time": 4.664439555000172 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 246.75, "completions/mean_terminated_length": 239.9677276611328, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4887060672044754, "epoch": 0.00093, "frac_reward_zero_std": 0.25, "grad_norm": 0.2640371024608612, "kl": 0.3053726674988866, "learning_rate": 9.99999962415676e-06, "loss": -0.0204, "num_tokens": 2048744.0, "reward": 1.967405080795288, "reward_std": 2.499349355697632, "rewards/rollout_reward_func/mean": 1.967405080795288, "rewards/rollout_reward_func/std": 3.225766181945801, "sampling/importance_sampling_ratio/max": 0.8340654969215393, "sampling/importance_sampling_ratio/mean": 0.5795985460281372, "sampling/importance_sampling_ratio/min": 0.12284219264984131, "sampling/sampling_logp_difference/max": 1.3642650842666626, "sampling/sampling_logp_difference/mean": 0.13924992084503174, "step": 93, "step_time": 9.02485181099928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4747870713472366, "epoch": 0.00094, "grad_norm": 0.23410014808177948, "kl": 0.32142616296187043, "learning_rate": 9.999999610853598e-06, "loss": -0.0206, "step": 94, "step_time": 5.07011508800133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 581.0, "completions/max_terminated_length": 581.0, "completions/mean_length": 157.28125, "completions/mean_terminated_length": 157.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1664393246173859, "epoch": 0.00095, "frac_reward_zero_std": 0.5, "grad_norm": 0.18488289415836334, "kl": 0.06721094995737076, "learning_rate": 9.999999597319077e-06, "loss": 0.0046, "num_tokens": 2086048.0, "reward": 3.17279314994812, "reward_std": 1.4421266317367554, "rewards/rollout_reward_func/mean": 3.17279314994812, "rewards/rollout_reward_func/std": 2.1261613368988037, "sampling/importance_sampling_ratio/max": 0.8425000309944153, "sampling/importance_sampling_ratio/mean": 0.7067235708236694, "sampling/importance_sampling_ratio/min": 0.31802764534950256, "sampling/sampling_logp_difference/max": 0.8441604971885681, "sampling/sampling_logp_difference/mean": 0.09930114448070526, "step": 95, "step_time": 7.825317881000046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.164254069328308, "epoch": 0.00096, "grad_norm": 0.21640680730342865, "kl": 0.06768974382430315, "learning_rate": 9.999999583553198e-06, "loss": 0.0046, "step": 96, "step_time": 4.247185356001864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.03125, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 288.03125, "completions/mean_terminated_length": 296.80645751953125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5986799895763397, "epoch": 0.00097, "frac_reward_zero_std": 0.0, "grad_norm": 0.6142963171005249, "kl": 0.37834287248551846, "learning_rate": 9.999999569555958e-06, "loss": -0.0185, "num_tokens": 2133194.0, "reward": 0.7202312350273132, "reward_std": 3.2530226707458496, "rewards/rollout_reward_func/mean": 0.7202312350273132, "rewards/rollout_reward_func/std": 3.295642375946045, "sampling/importance_sampling_ratio/max": 1.4411189556121826, "sampling/importance_sampling_ratio/mean": 0.6474981307983398, "sampling/importance_sampling_ratio/min": 0.00819697231054306, "sampling/sampling_logp_difference/max": 1.651160717010498, "sampling/sampling_logp_difference/mean": 0.18041104078292847, "step": 97, "step_time": 9.07756870799949 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.5846249610185623, "epoch": 0.00098, "grad_norm": 0.2721119821071625, "kl": 0.38701746240258217, "learning_rate": 9.99999955532736e-06, "loss": -0.0193, "step": 98, "step_time": 4.734437592000177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 232.5625, "completions/mean_terminated_length": 232.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1883085891604424, "epoch": 0.00099, "frac_reward_zero_std": 0.25, "grad_norm": 0.24683479964733124, "kl": 0.25273985508829355, "learning_rate": 9.999999540867401e-06, "loss": -0.0059, "num_tokens": 2175077.0, "reward": 1.3199145793914795, "reward_std": 2.06015682220459, "rewards/rollout_reward_func/mean": 1.3199145793914795, "rewards/rollout_reward_func/std": 3.2125861644744873, "sampling/importance_sampling_ratio/max": 0.8548383116722107, "sampling/importance_sampling_ratio/mean": 0.6793875694274902, "sampling/importance_sampling_ratio/min": 0.32304856181144714, "sampling/sampling_logp_difference/max": 0.7758204340934753, "sampling/sampling_logp_difference/mean": 0.10533061623573303, "step": 99, "step_time": 8.829210506999516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.1734348982572556, "epoch": 0.001, "grad_norm": 0.2670953869819641, "kl": 0.2414135718718171, "learning_rate": 9.999999526176084e-06, "loss": -0.0052, "step": 100, "step_time": 4.548869043001105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 568.0, "completions/max_terminated_length": 568.0, "completions/mean_length": 167.9375, "completions/mean_terminated_length": 167.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4711549282073975, "epoch": 0.00101, "frac_reward_zero_std": 0.25, "grad_norm": 0.44600358605384827, "kl": 0.6636635432951152, "learning_rate": 9.999999511253408e-06, "loss": -0.0235, "num_tokens": 2214634.0, "reward": 1.1731557846069336, "reward_std": 1.7159905433654785, "rewards/rollout_reward_func/mean": 1.1731557846069336, "rewards/rollout_reward_func/std": 3.5440027713775635, "sampling/importance_sampling_ratio/max": 0.8500983715057373, "sampling/importance_sampling_ratio/mean": 0.5969444513320923, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.9083564281463623, "sampling/sampling_logp_difference/mean": 0.17957772314548492, "step": 101, "step_time": 8.27366581899787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.4643440023064613, "epoch": 0.00102, "grad_norm": 0.3874184489250183, "kl": 0.831228859256953, "learning_rate": 9.99999949609937e-06, "loss": -0.0238, "step": 102, "step_time": 4.366456620999088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 211.125, "completions/mean_terminated_length": 211.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4426696449518204, "epoch": 0.00103, "frac_reward_zero_std": 0.0, "grad_norm": 0.6377424001693726, "kl": 0.609153538942337, "learning_rate": 9.999999480713976e-06, "loss": -0.0373, "num_tokens": 2258390.0, "reward": 1.1982793807983398, "reward_std": 2.668684959411621, "rewards/rollout_reward_func/mean": 1.1982793807983398, "rewards/rollout_reward_func/std": 3.4188103675842285, "sampling/importance_sampling_ratio/max": 0.9889529347419739, "sampling/importance_sampling_ratio/mean": 0.6130184531211853, "sampling/importance_sampling_ratio/min": 4.367172422277308e-18, "sampling/sampling_logp_difference/max": 15.44290828704834, "sampling/sampling_logp_difference/mean": 0.33652469515800476, "step": 103, "step_time": 9.140867977000198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.028245192719623446, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028245192719623446, "entropy": 1.4553562998771667, "epoch": 0.00104, "grad_norm": 1.3127943277359009, "kl": 0.6010738220065832, "learning_rate": 9.99999946509722e-06, "loss": -0.039, "step": 104, "step_time": 4.6115926009997565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 97.65625, "completions/mean_terminated_length": 97.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1370222494006157, "epoch": 0.00105, "frac_reward_zero_std": 0.5, "grad_norm": 0.384899377822876, "kl": 0.13393513206392527, "learning_rate": 9.999999449249107e-06, "loss": -0.0065, "num_tokens": 2292915.0, "reward": 3.3193359375, "reward_std": 1.5603448152542114, "rewards/rollout_reward_func/mean": 3.3193359375, "rewards/rollout_reward_func/std": 2.2768971920013428, "sampling/importance_sampling_ratio/max": 0.8583475351333618, "sampling/importance_sampling_ratio/mean": 0.7309748530387878, "sampling/importance_sampling_ratio/min": 0.08333110809326172, "sampling/sampling_logp_difference/max": 1.6220703125, "sampling/sampling_logp_difference/mean": 0.11095353215932846, "step": 105, "step_time": 8.083776143998875 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.1211768314242363, "epoch": 0.00106, "grad_norm": 0.18609535694122314, "kl": 0.13289557117968798, "learning_rate": 9.999999433169634e-06, "loss": -0.0066, "step": 106, "step_time": 5.051740761000474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 263.46875, "completions/mean_terminated_length": 263.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.353650487959385, "epoch": 0.00107, "frac_reward_zero_std": 0.25, "grad_norm": 0.466453492641449, "kl": 0.3108747862279415, "learning_rate": 9.999999416858801e-06, "loss": -0.0068, "num_tokens": 2337200.0, "reward": 2.899740695953369, "reward_std": 2.5667402744293213, "rewards/rollout_reward_func/mean": 2.899740695953369, "rewards/rollout_reward_func/std": 2.920200824737549, "sampling/importance_sampling_ratio/max": 1.4926074743270874, "sampling/importance_sampling_ratio/mean": 0.6507165431976318, "sampling/importance_sampling_ratio/min": 0.10119620710611343, "sampling/sampling_logp_difference/max": 1.736641764640808, "sampling/sampling_logp_difference/mean": 0.13990797102451324, "step": 107, "step_time": 8.825815408998096 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.3332961052656174, "epoch": 0.00108, "grad_norm": 0.2881586253643036, "kl": 0.34010628797113895, "learning_rate": 9.999999400316609e-06, "loss": -0.0078, "step": 108, "step_time": 4.627952240000923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 727.0, "completions/max_terminated_length": 727.0, "completions/mean_length": 214.0625, "completions/mean_terminated_length": 214.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2402007058262825, "epoch": 0.00109, "frac_reward_zero_std": 0.5, "grad_norm": 0.15843264758586884, "kl": 0.2749416660517454, "learning_rate": 9.999999383543059e-06, "loss": -0.0149, "num_tokens": 2378321.0, "reward": 3.2675328254699707, "reward_std": 1.5783872604370117, "rewards/rollout_reward_func/mean": 3.2675328254699707, "rewards/rollout_reward_func/std": 2.2358877658843994, "sampling/importance_sampling_ratio/max": 0.8686413168907166, "sampling/importance_sampling_ratio/mean": 0.7020887136459351, "sampling/importance_sampling_ratio/min": 0.17772793769836426, "sampling/sampling_logp_difference/max": 0.9853123426437378, "sampling/sampling_logp_difference/mean": 0.11397512257099152, "step": 109, "step_time": 8.286900273998981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.214910238981247, "epoch": 0.0011, "grad_norm": 0.1398218721151352, "kl": 0.2746839914470911, "learning_rate": 9.999999366538148e-06, "loss": -0.015, "step": 110, "step_time": 5.065564501997869 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 190.25, "completions/mean_terminated_length": 190.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2511784359812737, "epoch": 0.00111, "frac_reward_zero_std": 0.25, "grad_norm": 0.5258013010025024, "kl": 0.9097562991082668, "learning_rate": 9.999999349301878e-06, "loss": -0.0044, "num_tokens": 2420080.0, "reward": 1.6405235528945923, "reward_std": 2.262524127960205, "rewards/rollout_reward_func/mean": 1.6405235528945923, "rewards/rollout_reward_func/std": 3.039415121078491, "sampling/importance_sampling_ratio/max": 0.8686879277229309, "sampling/importance_sampling_ratio/mean": 0.701184868812561, "sampling/importance_sampling_ratio/min": 0.0910688117146492, "sampling/sampling_logp_difference/max": 1.3795437812805176, "sampling/sampling_logp_difference/mean": 0.11626594513654709, "step": 111, "step_time": 8.034462841000277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2453623786568642, "epoch": 0.00112, "grad_norm": 0.4173600673675537, "kl": 0.5955371828749776, "learning_rate": 9.999999331834249e-06, "loss": -0.0062, "step": 112, "step_time": 4.912073584002428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 374.875, "completions/mean_terminated_length": 374.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3247175514698029, "epoch": 0.00113, "frac_reward_zero_std": 0.0, "grad_norm": 0.358304888010025, "kl": 0.5801366977393627, "learning_rate": 9.99999931413526e-06, "loss": -0.0287, "num_tokens": 2470606.0, "reward": 1.169939637184143, "reward_std": 3.2278714179992676, "rewards/rollout_reward_func/mean": 1.169939637184143, "rewards/rollout_reward_func/std": 3.2247097492218018, "sampling/importance_sampling_ratio/max": 1.3752179145812988, "sampling/importance_sampling_ratio/mean": 0.6567858457565308, "sampling/importance_sampling_ratio/min": 1.119993284973939e-16, "sampling/sampling_logp_difference/max": 21.803834915161133, "sampling/sampling_logp_difference/mean": 0.3267441987991333, "step": 113, "step_time": 8.628134519001833 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 1.3166431784629822, "epoch": 0.00114, "grad_norm": 0.3047109544277191, "kl": 0.5824094768613577, "learning_rate": 9.999999296204912e-06, "loss": -0.0289, "step": 114, "step_time": 5.159040177999486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 404.65625, "completions/mean_terminated_length": 404.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8135550022125244, "epoch": 0.00115, "frac_reward_zero_std": 0.0, "grad_norm": 0.375212162733078, "kl": 0.656751710921526, "learning_rate": 9.999999278043205e-06, "loss": -0.0293, "num_tokens": 2523208.0, "reward": 1.935072422027588, "reward_std": 3.0513288974761963, "rewards/rollout_reward_func/mean": 1.935072422027588, "rewards/rollout_reward_func/std": 3.284796714782715, "sampling/importance_sampling_ratio/max": 0.8097750544548035, "sampling/importance_sampling_ratio/mean": 0.4798816740512848, "sampling/importance_sampling_ratio/min": 4.336374161626486e-20, "sampling/sampling_logp_difference/max": 17.531265258789062, "sampling/sampling_logp_difference/mean": 0.6329524517059326, "step": 115, "step_time": 8.729425274999812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013557692524045706, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013557692524045706, "entropy": 1.798963040113449, "epoch": 0.00116, "grad_norm": 0.33327731490135193, "kl": 0.6851035915315151, "learning_rate": 9.99999925965014e-06, "loss": -0.0302, "step": 116, "step_time": 5.208779234002577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 201.34375, "completions/mean_terminated_length": 201.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2387980744242668, "epoch": 0.00117, "frac_reward_zero_std": 0.25, "grad_norm": 0.26032188534736633, "kl": 0.26366954296827316, "learning_rate": 9.999999241025713e-06, "loss": -0.026, "num_tokens": 2565650.0, "reward": 0.5629942417144775, "reward_std": 2.2255823612213135, "rewards/rollout_reward_func/mean": 0.5629942417144775, "rewards/rollout_reward_func/std": 3.3881778717041016, "sampling/importance_sampling_ratio/max": 0.9043019413948059, "sampling/importance_sampling_ratio/mean": 0.6852209568023682, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.9190664291381836, "sampling/sampling_logp_difference/mean": 0.13644807040691376, "step": 117, "step_time": 8.26714903300308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2233125492930412, "epoch": 0.00118, "grad_norm": 0.14706304669380188, "kl": 0.24578332714736462, "learning_rate": 9.99999922216993e-06, "loss": -0.0262, "step": 118, "step_time": 5.088276224001675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017361111473292112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017361111473292112, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 240.6875, "completions/mean_terminated_length": 240.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2477863281965256, "epoch": 0.00119, "frac_reward_zero_std": 0.0, "grad_norm": 1.6588044166564941, "kl": 2.22871932759881, "learning_rate": 9.999999203082784e-06, "loss": -0.0167, "num_tokens": 2610627.0, "reward": 1.1678540706634521, "reward_std": 1.690277338027954, "rewards/rollout_reward_func/mean": 1.1678540706634521, "rewards/rollout_reward_func/std": 3.471052646636963, "sampling/importance_sampling_ratio/max": 0.8998735547065735, "sampling/importance_sampling_ratio/mean": 0.6660821437835693, "sampling/importance_sampling_ratio/min": 0.09360919892787933, "sampling/sampling_logp_difference/max": 1.0761280059814453, "sampling/sampling_logp_difference/mean": 0.14003139734268188, "step": 119, "step_time": 8.34965265700339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 1.258188247680664, "epoch": 0.0012, "grad_norm": 0.615346372127533, "kl": 1.5200740173459053, "learning_rate": 9.999999183764282e-06, "loss": -0.0203, "step": 120, "step_time": 5.118856254997809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 332.90625, "completions/mean_terminated_length": 316.8000183105469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.355370245873928, "epoch": 0.00121, "frac_reward_zero_std": 0.25, "grad_norm": 0.37659114599227905, "kl": 0.4152047783136368, "learning_rate": 9.999999164214418e-06, "loss": -0.0225, "num_tokens": 2657801.0, "reward": 2.821887731552124, "reward_std": 1.866607666015625, "rewards/rollout_reward_func/mean": 2.821887731552124, "rewards/rollout_reward_func/std": 2.3679847717285156, "sampling/importance_sampling_ratio/max": 0.8719721436500549, "sampling/importance_sampling_ratio/mean": 0.6062960624694824, "sampling/importance_sampling_ratio/min": 0.04423459991812706, "sampling/sampling_logp_difference/max": 2.049921989440918, "sampling/sampling_logp_difference/mean": 0.1384788453578949, "step": 121, "step_time": 8.74563287699857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3356432616710663, "epoch": 0.00122, "grad_norm": 0.37042176723480225, "kl": 0.41443452797830105, "learning_rate": 9.999999144433197e-06, "loss": -0.0226, "step": 122, "step_time": 4.68855290799911 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 493.0, "completions/max_terminated_length": 493.0, "completions/mean_length": 139.96875, "completions/mean_terminated_length": 139.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9321612492203712, "epoch": 0.00123, "frac_reward_zero_std": 0.5, "grad_norm": 0.271528035402298, "kl": 0.26738199731335044, "learning_rate": 9.999999124420615e-06, "loss": 0.0056, "num_tokens": 2696422.0, "reward": 3.243018627166748, "reward_std": 1.166715145111084, "rewards/rollout_reward_func/mean": 3.243018627166748, "rewards/rollout_reward_func/std": 2.236941337585449, "sampling/importance_sampling_ratio/max": 0.886531412601471, "sampling/importance_sampling_ratio/mean": 0.7436658143997192, "sampling/importance_sampling_ratio/min": 0.07157089561223984, "sampling/sampling_logp_difference/max": 1.1098639965057373, "sampling/sampling_logp_difference/mean": 0.0943179801106453, "step": 123, "step_time": 8.067842939995899 }, { "clip_ratio/high_max": 0.04087752569466829, "clip_ratio/high_mean": 0.020438762847334146, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027383207343518734, "entropy": 0.922589048743248, "epoch": 0.00124, "grad_norm": 0.13495482504367828, "kl": 0.2979471506550908, "learning_rate": 9.999999104176675e-06, "loss": 0.0054, "step": 124, "step_time": 4.672525504998703 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.025426136795431376, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03237058129161596, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 184.53125, "completions/mean_terminated_length": 184.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.310823455452919, "epoch": 0.00125, "frac_reward_zero_std": 0.25, "grad_norm": 0.4094269573688507, "kl": 0.3710424145683646, "learning_rate": 9.999999083701375e-06, "loss": -0.0276, "num_tokens": 2738446.0, "reward": 0.312050461769104, "reward_std": 1.6147348880767822, "rewards/rollout_reward_func/mean": 0.312050461769104, "rewards/rollout_reward_func/std": 3.2261667251586914, "sampling/importance_sampling_ratio/max": 0.8833670020103455, "sampling/importance_sampling_ratio/mean": 0.6620410680770874, "sampling/importance_sampling_ratio/min": 0.03429631516337395, "sampling/sampling_logp_difference/max": 2.63641619682312, "sampling/sampling_logp_difference/mean": 0.15128785371780396, "step": 125, "step_time": 8.423547819003943 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.02130681835114956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028251262847334146, "entropy": 1.3082057759165764, "epoch": 0.00126, "grad_norm": 0.4697313904762268, "kl": 0.3471468612551689, "learning_rate": 9.999999062994716e-06, "loss": -0.0277, "step": 126, "step_time": 4.655042839000089 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 216.46875, "completions/mean_terminated_length": 216.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0506554767489433, "epoch": 0.00127, "frac_reward_zero_std": 0.25, "grad_norm": 0.252723753452301, "kl": 0.2694012215360999, "learning_rate": 9.999999042056698e-06, "loss": 0.0053, "num_tokens": 2780917.0, "reward": 3.346662759780884, "reward_std": 1.3463809490203857, "rewards/rollout_reward_func/mean": 3.346662759780884, "rewards/rollout_reward_func/std": 2.114306926727295, "sampling/importance_sampling_ratio/max": 1.732710838317871, "sampling/importance_sampling_ratio/mean": 0.7513864040374756, "sampling/importance_sampling_ratio/min": 1.3906487756820476e-20, "sampling/sampling_logp_difference/max": 19.309947967529297, "sampling/sampling_logp_difference/mean": 0.371071994304657, "step": 127, "step_time": 8.698544781003875 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 1.0367054492235184, "epoch": 0.00128, "grad_norm": 0.6366693377494812, "kl": 0.2684983732178807, "learning_rate": 9.99999902088732e-06, "loss": 0.0062, "step": 128, "step_time": 4.463675903001786 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 197.09375, "completions/mean_terminated_length": 197.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2090764343738556, "epoch": 0.00129, "frac_reward_zero_std": 0.25, "grad_norm": 0.930417537689209, "kl": 0.8662008149549365, "learning_rate": 9.999998999486583e-06, "loss": -0.0018, "num_tokens": 2821649.0, "reward": 0.5058659315109253, "reward_std": 1.750098705291748, "rewards/rollout_reward_func/mean": 0.5058659315109253, "rewards/rollout_reward_func/std": 3.3122169971466064, "sampling/importance_sampling_ratio/max": 2.134777307510376, "sampling/importance_sampling_ratio/mean": 0.7420268058776855, "sampling/importance_sampling_ratio/min": 0.06644809246063232, "sampling/sampling_logp_difference/max": 1.862166404724121, "sampling/sampling_logp_difference/mean": 0.14809246361255646, "step": 129, "step_time": 8.425946331000887 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020312500186264515, "entropy": 1.2335382550954819, "epoch": 0.0013, "grad_norm": 0.5553529262542725, "kl": 0.5725571596994996, "learning_rate": 9.999998977854486e-06, "loss": -0.0033, "step": 130, "step_time": 4.44442986499962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 286.125, "completions/mean_terminated_length": 286.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9816871136426926, "epoch": 0.00131, "frac_reward_zero_std": 0.25, "grad_norm": 0.6711408495903015, "kl": 0.3799923127517104, "learning_rate": 9.99999895599103e-06, "loss": -0.021, "num_tokens": 2867656.0, "reward": 2.7903671264648438, "reward_std": 2.359102964401245, "rewards/rollout_reward_func/mean": 2.7903671264648438, "rewards/rollout_reward_func/std": 2.694920778274536, "sampling/importance_sampling_ratio/max": 0.8901586532592773, "sampling/importance_sampling_ratio/mean": 0.7296309471130371, "sampling/importance_sampling_ratio/min": 0.266862154006958, "sampling/sampling_logp_difference/max": 1.0548845529556274, "sampling/sampling_logp_difference/mean": 0.08279058337211609, "step": 131, "step_time": 9.087207767999644 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.990684986114502, "epoch": 0.00132, "grad_norm": 0.27610498666763306, "kl": 0.3753572916612029, "learning_rate": 9.999998933896215e-06, "loss": -0.0213, "step": 132, "step_time": 4.709462077000353 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008854166837409139, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 229.28125, "completions/mean_terminated_length": 229.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2144686877727509, "epoch": 0.00133, "frac_reward_zero_std": 0.0, "grad_norm": 0.6727531552314758, "kl": 0.35478102788329124, "learning_rate": 9.999998911570041e-06, "loss": -0.0404, "num_tokens": 2911654.0, "reward": -0.07718247175216675, "reward_std": 2.7860493659973145, "rewards/rollout_reward_func/mean": -0.07718247175216675, "rewards/rollout_reward_func/std": 3.316448450088501, "sampling/importance_sampling_ratio/max": 1.0132944583892822, "sampling/importance_sampling_ratio/mean": 0.6865078210830688, "sampling/importance_sampling_ratio/min": 2.734994034633148e-13, "sampling/sampling_logp_difference/max": 12.4773530960083, "sampling/sampling_logp_difference/mean": 0.2706117033958435, "step": 133, "step_time": 9.26999236699885 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01714015193283558, "entropy": 1.2061478644609451, "epoch": 0.00134, "grad_norm": 0.521673858165741, "kl": 0.3528296910226345, "learning_rate": 9.999998889012509e-06, "loss": -0.0411, "step": 134, "step_time": 4.646553983997364 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009226190624758601, "completions/clipped_ratio": 0.03125, "completions/max_length": 690.0, "completions/max_terminated_length": 690.0, "completions/mean_length": 327.65625, "completions/mean_terminated_length": 337.70965576171875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9923733323812485, "epoch": 0.00135, "frac_reward_zero_std": 0.0, "grad_norm": 3.0150668621063232, "kl": 3.127168197184801, "learning_rate": 9.999998866223617e-06, "loss": -0.0222, "num_tokens": 2960492.0, "reward": 2.5414891242980957, "reward_std": 1.8814085721969604, "rewards/rollout_reward_func/mean": 2.5414891242980957, "rewards/rollout_reward_func/std": 2.6769306659698486, "sampling/importance_sampling_ratio/max": 0.898768424987793, "sampling/importance_sampling_ratio/mean": 0.6810963153839111, "sampling/importance_sampling_ratio/min": 0.002466224366798997, "sampling/sampling_logp_difference/max": 2.812283515930176, "sampling/sampling_logp_difference/mean": 0.11650547385215759, "step": 135, "step_time": 8.721548871000778 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01490800897590816, "entropy": 0.9959061443805695, "epoch": 0.00136, "grad_norm": 1.79884934425354, "kl": 1.9692962355911732, "learning_rate": 9.999998843203364e-06, "loss": -0.0257, "step": 136, "step_time": 4.691219431000718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.03125, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 242.84375, "completions/mean_terminated_length": 237.19354248046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3220156133174896, "epoch": 0.00137, "frac_reward_zero_std": 0.0, "grad_norm": 0.4622597396373749, "kl": 0.4189475532621145, "learning_rate": 9.999998819951753e-06, "loss": -0.016, "num_tokens": 3004600.0, "reward": 0.25906217098236084, "reward_std": 2.6490468978881836, "rewards/rollout_reward_func/mean": 0.25906217098236084, "rewards/rollout_reward_func/std": 3.2965714931488037, "sampling/importance_sampling_ratio/max": 0.9104325175285339, "sampling/importance_sampling_ratio/mean": 0.6504316329956055, "sampling/importance_sampling_ratio/min": 0.010162248276174068, "sampling/sampling_logp_difference/max": 1.9223031997680664, "sampling/sampling_logp_difference/mean": 0.16847951710224152, "step": 137, "step_time": 8.799324890002026 }, { "clip_ratio/high_max": 0.026041666977107525, "clip_ratio/high_mean": 0.013020833488553762, "clip_ratio/low_mean": 0.024479167070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000102445483, "entropy": 1.3606244400143623, "epoch": 0.00138, "grad_norm": 0.2303469032049179, "kl": 0.4591524535790086, "learning_rate": 9.999998796468782e-06, "loss": -0.016, "step": 138, "step_time": 4.543890040999031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 334.5625, "completions/mean_terminated_length": 334.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3044259995222092, "epoch": 0.00139, "frac_reward_zero_std": 0.0, "grad_norm": 0.5992356538772583, "kl": 0.41544256173074245, "learning_rate": 9.999998772754452e-06, "loss": -0.0268, "num_tokens": 3054760.0, "reward": 1.8450011014938354, "reward_std": 2.890075206756592, "rewards/rollout_reward_func/mean": 1.8450011014938354, "rewards/rollout_reward_func/std": 3.4126341342926025, "sampling/importance_sampling_ratio/max": 1.8168551921844482, "sampling/importance_sampling_ratio/mean": 0.7018195986747742, "sampling/importance_sampling_ratio/min": 0.0883556678891182, "sampling/sampling_logp_difference/max": 1.1391968727111816, "sampling/sampling_logp_difference/mean": 0.1297687441110611, "step": 139, "step_time": 8.539788423999198 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02291666716337204, "entropy": 1.301436260342598, "epoch": 0.0014, "grad_norm": 0.48136088252067566, "kl": 0.44433126226067543, "learning_rate": 9.999998748808764e-06, "loss": -0.0277, "step": 140, "step_time": 5.166287205000117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 267.1875, "completions/mean_terminated_length": 267.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4421513974666595, "epoch": 0.00141, "frac_reward_zero_std": 0.25, "grad_norm": 0.4028409719467163, "kl": 0.32596446573734283, "learning_rate": 9.999998724631715e-06, "loss": -0.0188, "num_tokens": 3100456.0, "reward": 1.3944412469863892, "reward_std": 2.4025685787200928, "rewards/rollout_reward_func/mean": 1.3944412469863892, "rewards/rollout_reward_func/std": 3.2429115772247314, "sampling/importance_sampling_ratio/max": 1.0816926956176758, "sampling/importance_sampling_ratio/mean": 0.6859432458877563, "sampling/importance_sampling_ratio/min": 0.00025997235206887126, "sampling/sampling_logp_difference/max": 3.583299398422241, "sampling/sampling_logp_difference/mean": 0.17843911051750183, "step": 141, "step_time": 8.531648317999498 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.03072916716337204, "clip_ratio/low_min": 0.02083333395421505, "clip_ratio/region_mean": 0.0369791672565043, "entropy": 1.4622048884630203, "epoch": 0.00142, "grad_norm": 0.40485310554504395, "kl": 0.31938351318240166, "learning_rate": 9.999998700223308e-06, "loss": -0.0199, "step": 142, "step_time": 5.2098391899980925 }, { "clip_ratio/high_max": 0.11069444566965103, "clip_ratio/high_mean": 0.055347222834825516, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0622916673310101, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 335.46875, "completions/mean_terminated_length": 335.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5268013775348663, "epoch": 0.00143, "frac_reward_zero_std": 0.0, "grad_norm": 0.2813084125518799, "kl": 0.41015079617500305, "learning_rate": 9.999998675583542e-06, "loss": -0.0177, "num_tokens": 3150184.0, "reward": 2.1760172843933105, "reward_std": 3.1307291984558105, "rewards/rollout_reward_func/mean": 2.1760172843933105, "rewards/rollout_reward_func/std": 3.1140639781951904, "sampling/importance_sampling_ratio/max": 0.9038472771644592, "sampling/importance_sampling_ratio/mean": 0.5733640789985657, "sampling/importance_sampling_ratio/min": 1.9825093989340068e-17, "sampling/sampling_logp_difference/max": 18.09393310546875, "sampling/sampling_logp_difference/mean": 0.5492222309112549, "step": 143, "step_time": 8.656024183999762 }, { "clip_ratio/high_max": 0.06208333373069763, "clip_ratio/high_mean": 0.031041666865348816, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031041666865348816, "entropy": 1.5699512660503387, "epoch": 0.00144, "grad_norm": 0.32212403416633606, "kl": 0.4178838785737753, "learning_rate": 9.999998650712415e-06, "loss": -0.0181, "step": 144, "step_time": 5.197006226997473 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029687500093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 211.15625, "completions/mean_terminated_length": 211.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4725349247455597, "epoch": 0.00145, "frac_reward_zero_std": 0.0, "grad_norm": 0.9448748230934143, "kl": 0.7301181349903345, "learning_rate": 9.999998625609931e-06, "loss": -0.0369, "num_tokens": 3194250.0, "reward": 0.07949870824813843, "reward_std": 2.8198013305664062, "rewards/rollout_reward_func/mean": 0.07949870824813843, "rewards/rollout_reward_func/std": 3.4731156826019287, "sampling/importance_sampling_ratio/max": 1.5322343111038208, "sampling/importance_sampling_ratio/mean": 0.7367376089096069, "sampling/importance_sampling_ratio/min": 0.0559135340154171, "sampling/sampling_logp_difference/max": 1.8956400156021118, "sampling/sampling_logp_difference/mean": 0.17801159620285034, "step": 145, "step_time": 8.227982959000656 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.05572916707023978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07135416707023978, "entropy": 1.5054499953985214, "epoch": 0.00146, "grad_norm": 0.6633767485618591, "kl": 0.6683848723769188, "learning_rate": 9.999998600276087e-06, "loss": -0.0381, "step": 146, "step_time": 5.071890896999321 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.021701388992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027951389085501432, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 383.53125, "completions/mean_terminated_length": 383.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7917729020118713, "epoch": 0.00147, "frac_reward_zero_std": 0.0, "grad_norm": 0.5746680498123169, "kl": 1.2162466496229172, "learning_rate": 9.999998574710883e-06, "loss": -0.0123, "num_tokens": 3245994.0, "reward": -0.28947412967681885, "reward_std": 2.5466694831848145, "rewards/rollout_reward_func/mean": -0.28947412967681885, "rewards/rollout_reward_func/std": 2.691091537475586, "sampling/importance_sampling_ratio/max": 0.8393126130104065, "sampling/importance_sampling_ratio/mean": 0.5239529609680176, "sampling/importance_sampling_ratio/min": 0.052705317735672, "sampling/sampling_logp_difference/max": 2.442941188812256, "sampling/sampling_logp_difference/mean": 0.2120271921157837, "step": 147, "step_time": 8.738407132002976 }, { "clip_ratio/high_max": 0.02638888917863369, "clip_ratio/high_mean": 0.013194444589316845, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027951389085501432, "entropy": 1.8217622637748718, "epoch": 0.00148, "grad_norm": 0.3492240309715271, "kl": 0.9999618865549564, "learning_rate": 9.999998548914318e-06, "loss": -0.0134, "step": 148, "step_time": 5.281432364001375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 146.5625, "completions/mean_terminated_length": 150.77418518066406, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1756278946995735, "epoch": 0.00149, "frac_reward_zero_std": 0.25, "grad_norm": 0.3821263611316681, "kl": 0.15307023422792554, "learning_rate": 9.999998522886397e-06, "loss": -0.0286, "num_tokens": 3284052.0, "reward": 2.6114084720611572, "reward_std": 2.0032644271850586, "rewards/rollout_reward_func/mean": 2.6114084720611572, "rewards/rollout_reward_func/std": 2.3627264499664307, "sampling/importance_sampling_ratio/max": 1.6664572954177856, "sampling/importance_sampling_ratio/mean": 0.7434263825416565, "sampling/importance_sampling_ratio/min": 0.01095191203057766, "sampling/sampling_logp_difference/max": 1.7267346382141113, "sampling/sampling_logp_difference/mean": 0.14059627056121826, "step": 149, "step_time": 8.371668487001443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01215277798473835, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "entropy": 1.1505991742014885, "epoch": 0.0015, "grad_norm": 0.2584988474845886, "kl": 0.18450950738042593, "learning_rate": 9.999998496627115e-06, "loss": -0.0299, "step": 150, "step_time": 5.117321968004035 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "completions/clipped_ratio": 0.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 308.5, "completions/mean_terminated_length": 308.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4419046491384506, "epoch": 0.00151, "frac_reward_zero_std": 0.0, "grad_norm": 0.8367948532104492, "kl": 0.6049305982887745, "learning_rate": 9.999998470136475e-06, "loss": -0.0397, "num_tokens": 3333233.0, "reward": 0.6236060857772827, "reward_std": 2.532585859298706, "rewards/rollout_reward_func/mean": 0.6236060857772827, "rewards/rollout_reward_func/std": 3.177070379257202, "sampling/importance_sampling_ratio/max": 1.3900566101074219, "sampling/importance_sampling_ratio/mean": 0.6832244396209717, "sampling/importance_sampling_ratio/min": 0.04086190089583397, "sampling/sampling_logp_difference/max": 1.7017148733139038, "sampling/sampling_logp_difference/mean": 0.16128653287887573, "step": 151, "step_time": 8.290639024002303 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.018750000279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 1.39664925634861, "epoch": 0.00152, "grad_norm": 0.30858343839645386, "kl": 0.6339622512459755, "learning_rate": 9.999998443414474e-06, "loss": -0.0412, "step": 152, "step_time": 4.561467573996197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "completions/clipped_ratio": 0.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 307.28125, "completions/mean_terminated_length": 307.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8131889551877975, "epoch": 0.00153, "frac_reward_zero_std": 0.0, "grad_norm": 0.39587199687957764, "kl": 0.6584779666736722, "learning_rate": 9.999998416461115e-06, "loss": -0.0579, "num_tokens": 3380136.0, "reward": -0.964415967464447, "reward_std": 1.724520206451416, "rewards/rollout_reward_func/mean": -0.964415967464447, "rewards/rollout_reward_func/std": 2.456542491912842, "sampling/importance_sampling_ratio/max": 2.2638700008392334, "sampling/importance_sampling_ratio/mean": 0.5994124412536621, "sampling/importance_sampling_ratio/min": 0.0469188429415226, "sampling/sampling_logp_difference/max": 2.2139639854431152, "sampling/sampling_logp_difference/mean": 0.23710989952087402, "step": 153, "step_time": 9.147835786998257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.033234127797186375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033234127797186375, "entropy": 1.8073970526456833, "epoch": 0.00154, "grad_norm": 0.3911881148815155, "kl": 0.6852459143847227, "learning_rate": 9.999998389276397e-06, "loss": -0.0591, "step": 154, "step_time": 4.698305214998982 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.013494318351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025994318071752787, "completions/clipped_ratio": 0.0, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 389.0625, "completions/mean_terminated_length": 389.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2222143709659576, "epoch": 0.00155, "frac_reward_zero_std": 0.0, "grad_norm": 0.3362582325935364, "kl": 0.6404581721872091, "learning_rate": 9.999998361860319e-06, "loss": -0.0334, "num_tokens": 3431099.0, "reward": 2.6694071292877197, "reward_std": 2.5446906089782715, "rewards/rollout_reward_func/mean": 2.6694071292877197, "rewards/rollout_reward_func/std": 3.290167808532715, "sampling/importance_sampling_ratio/max": 0.8222158551216125, "sampling/importance_sampling_ratio/mean": 0.5994908213615417, "sampling/importance_sampling_ratio/min": 0.21654866635799408, "sampling/sampling_logp_difference/max": 1.1304153203964233, "sampling/sampling_logp_difference/mean": 0.14152154326438904, "step": 155, "step_time": 9.200423039999805 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.027556818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040056818164885044, "entropy": 1.2189196199178696, "epoch": 0.00156, "grad_norm": 0.30689436197280884, "kl": 0.6335372105240822, "learning_rate": 9.99999833421288e-06, "loss": -0.0342, "step": 156, "step_time": 4.743337452999185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 316.46875, "completions/mean_terminated_length": 316.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5837257876992226, "epoch": 0.00157, "frac_reward_zero_std": 0.0, "grad_norm": 2.0435855388641357, "kl": 0.8799369372427464, "learning_rate": 9.999998306334084e-06, "loss": -0.0204, "num_tokens": 3479461.0, "reward": -0.3385336399078369, "reward_std": 2.382631301879883, "rewards/rollout_reward_func/mean": -0.3385336399078369, "rewards/rollout_reward_func/std": 3.1213438510894775, "sampling/importance_sampling_ratio/max": 1.8760546445846558, "sampling/importance_sampling_ratio/mean": 0.6493722796440125, "sampling/importance_sampling_ratio/min": 0.022017143666744232, "sampling/sampling_logp_difference/max": 1.9306042194366455, "sampling/sampling_logp_difference/mean": 0.21119727194309235, "step": 157, "step_time": 9.055644448002568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02881944552063942, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.02881944552063942, "entropy": 1.5248295441269875, "epoch": 0.00158, "grad_norm": 0.31136661767959595, "kl": 0.8541321940720081, "learning_rate": 9.99999827822393e-06, "loss": -0.0248, "step": 158, "step_time": 4.748511123998469 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 296.625, "completions/mean_terminated_length": 296.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1072182357311249, "epoch": 0.00159, "frac_reward_zero_std": 0.25, "grad_norm": 0.8541929721832275, "kl": 1.5142261497676373, "learning_rate": 9.999998249882414e-06, "loss": -0.0258, "num_tokens": 3526390.0, "reward": 1.9590237140655518, "reward_std": 2.3490328788757324, "rewards/rollout_reward_func/mean": 1.9590237140655518, "rewards/rollout_reward_func/std": 3.302567720413208, "sampling/importance_sampling_ratio/max": 1.1023741960525513, "sampling/importance_sampling_ratio/mean": 0.7346677780151367, "sampling/importance_sampling_ratio/min": 0.02572685480117798, "sampling/sampling_logp_difference/max": 3.1846976280212402, "sampling/sampling_logp_difference/mean": 0.11066687107086182, "step": 159, "step_time": 8.974032844000249 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.024553571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030235390178859234, "entropy": 1.0959526598453522, "epoch": 0.0016, "grad_norm": 0.5121492147445679, "kl": 1.027910215780139, "learning_rate": 9.999998221309542e-06, "loss": -0.0283, "step": 160, "step_time": 4.727157241999521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 239.5, "completions/mean_terminated_length": 239.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1496014147996902, "epoch": 0.00161, "frac_reward_zero_std": 0.0, "grad_norm": 0.8561328649520874, "kl": 1.3528170138597488, "learning_rate": 9.999998192505309e-06, "loss": -0.0293, "num_tokens": 3570110.0, "reward": 0.4218023419380188, "reward_std": 2.1406993865966797, "rewards/rollout_reward_func/mean": 0.4218023419380188, "rewards/rollout_reward_func/std": 3.4171457290649414, "sampling/importance_sampling_ratio/max": 0.9382893443107605, "sampling/importance_sampling_ratio/mean": 0.7126387357711792, "sampling/importance_sampling_ratio/min": 0.06139320880174637, "sampling/sampling_logp_difference/max": 1.7130622863769531, "sampling/sampling_logp_difference/mean": 0.13841870427131653, "step": 161, "step_time": 8.80734174400095 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.1979045495390892, "epoch": 0.00162, "grad_norm": 0.5170145630836487, "kl": 0.7765293065458536, "learning_rate": 9.999998163469716e-06, "loss": -0.0305, "step": 162, "step_time": 4.437624299998788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 326.1875, "completions/mean_terminated_length": 326.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2955261319875717, "epoch": 0.00163, "frac_reward_zero_std": 0.0, "grad_norm": 0.2643387019634247, "kl": 0.6405982952564955, "learning_rate": 9.999998134202764e-06, "loss": -0.0264, "num_tokens": 3618124.0, "reward": -0.2343170940876007, "reward_std": 2.0977234840393066, "rewards/rollout_reward_func/mean": -0.2343170940876007, "rewards/rollout_reward_func/std": 3.0833513736724854, "sampling/importance_sampling_ratio/max": 0.9166899919509888, "sampling/importance_sampling_ratio/mean": 0.6636651754379272, "sampling/importance_sampling_ratio/min": 0.027080586180090904, "sampling/sampling_logp_difference/max": 2.392733097076416, "sampling/sampling_logp_difference/mean": 0.1527712047100067, "step": 163, "step_time": 9.191815565001889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3217408657073975, "epoch": 0.00164, "grad_norm": 0.2938953638076782, "kl": 0.6285237902775407, "learning_rate": 9.999998104704453e-06, "loss": -0.0262, "step": 164, "step_time": 4.709861910001564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 314.25, "completions/mean_terminated_length": 314.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6975739672780037, "epoch": 0.00165, "frac_reward_zero_std": 0.0, "grad_norm": 0.627184271812439, "kl": 0.7695172298699617, "learning_rate": 9.999998074974785e-06, "loss": -0.0211, "num_tokens": 3666527.0, "reward": 1.590512990951538, "reward_std": 1.9791123867034912, "rewards/rollout_reward_func/mean": 1.590512990951538, "rewards/rollout_reward_func/std": 3.3344736099243164, "sampling/importance_sampling_ratio/max": 0.8019801378250122, "sampling/importance_sampling_ratio/mean": 0.5018174052238464, "sampling/importance_sampling_ratio/min": 0.05402408167719841, "sampling/sampling_logp_difference/max": 1.6203422546386719, "sampling/sampling_logp_difference/mean": 0.19480445981025696, "step": 165, "step_time": 9.032972052997138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01957070827484131, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.01957070827484131, "entropy": 1.7281949520111084, "epoch": 0.00166, "grad_norm": 0.40164005756378174, "kl": 0.7744492739439011, "learning_rate": 9.999998045013754e-06, "loss": -0.0213, "step": 166, "step_time": 4.538681028996507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010233918204903603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010233918204903603, "completions/clipped_ratio": 0.03125, "completions/max_length": 582.0, "completions/max_terminated_length": 582.0, "completions/mean_length": 207.15625, "completions/mean_terminated_length": 213.32257080078125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3253183141350746, "epoch": 0.00167, "frac_reward_zero_std": 0.0, "grad_norm": 0.11104938387870789, "kl": 0.4776045195758343, "learning_rate": 9.999998014821366e-06, "loss": -0.0455, "num_tokens": 3708653.0, "reward": 2.0800609588623047, "reward_std": 2.500545024871826, "rewards/rollout_reward_func/mean": 2.0800609588623047, "rewards/rollout_reward_func/std": 3.134835720062256, "sampling/importance_sampling_ratio/max": 0.9213301539421082, "sampling/importance_sampling_ratio/mean": 0.662924587726593, "sampling/importance_sampling_ratio/min": 0.009712174534797668, "sampling/sampling_logp_difference/max": 1.9158085584640503, "sampling/sampling_logp_difference/mean": 0.16099217534065247, "step": 167, "step_time": 8.562418814000921 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.01717836270108819, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026106934528797865, "entropy": 1.320396900177002, "epoch": 0.00168, "grad_norm": 0.10037848353385925, "kl": 0.5044317813590169, "learning_rate": 9.999997984397618e-06, "loss": -0.0454, "step": 168, "step_time": 4.430021971000315 }, { "clip_ratio/high_max": 0.030357143841683865, "clip_ratio/high_mean": 0.015178571920841932, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015178571920841932, "completions/clipped_ratio": 0.0625, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 416.53125, "completions/mean_terminated_length": 411.63336181640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.7116235047578812, "epoch": 0.00169, "frac_reward_zero_std": 0.0, "grad_norm": 0.4068565368652344, "kl": 0.7224480472505093, "learning_rate": 9.999997953742511e-06, "loss": -0.0201, "num_tokens": 3761679.0, "reward": 1.020938754081726, "reward_std": 2.9724457263946533, "rewards/rollout_reward_func/mean": 1.020938754081726, "rewards/rollout_reward_func/std": 3.237048864364624, "sampling/importance_sampling_ratio/max": 0.9660463333129883, "sampling/importance_sampling_ratio/mean": 0.5537210702896118, "sampling/importance_sampling_ratio/min": 3.476083657005802e-05, "sampling/sampling_logp_difference/max": 1.962423324584961, "sampling/sampling_logp_difference/mean": 0.22366788983345032, "step": 169, "step_time": 9.150613829000577 }, { "clip_ratio/high_max": 0.04285714402794838, "clip_ratio/high_mean": 0.02142857201397419, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02142857201397419, "entropy": 1.6991557851433754, "epoch": 0.0017, "grad_norm": 0.41731277108192444, "kl": 0.7235119231045246, "learning_rate": 9.999997922856044e-06, "loss": -0.0206, "step": 170, "step_time": 4.583088712997778 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 312.90625, "completions/mean_terminated_length": 312.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0834757313132286, "epoch": 0.00171, "frac_reward_zero_std": 0.25, "grad_norm": 0.5455321073532104, "kl": 0.5134917292743921, "learning_rate": 9.999997891738219e-06, "loss": -0.0115, "num_tokens": 3808498.0, "reward": 3.3664751052856445, "reward_std": 1.5914669036865234, "rewards/rollout_reward_func/mean": 3.3664751052856445, "rewards/rollout_reward_func/std": 2.424177885055542, "sampling/importance_sampling_ratio/max": 0.927939772605896, "sampling/importance_sampling_ratio/mean": 0.6779882907867432, "sampling/importance_sampling_ratio/min": 0.061545480042696, "sampling/sampling_logp_difference/max": 1.7264273166656494, "sampling/sampling_logp_difference/mean": 0.11321510374546051, "step": 171, "step_time": 8.323166243000742 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 1.0771725848317146, "epoch": 0.00172, "grad_norm": 0.4494301378726959, "kl": 0.5340264840051532, "learning_rate": 9.999997860389035e-06, "loss": -0.0131, "step": 172, "step_time": 4.979321669998171 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 340.125, "completions/mean_terminated_length": 340.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1932927370071411, "epoch": 0.00173, "frac_reward_zero_std": 0.0, "grad_norm": 0.30508238077163696, "kl": 0.5225230045616627, "learning_rate": 9.99999782880849e-06, "loss": -0.0219, "num_tokens": 3857096.0, "reward": 0.9056083559989929, "reward_std": 2.209455728530884, "rewards/rollout_reward_func/mean": 0.9056083559989929, "rewards/rollout_reward_func/std": 3.2102365493774414, "sampling/importance_sampling_ratio/max": 0.926489531993866, "sampling/importance_sampling_ratio/mean": 0.6691128015518188, "sampling/importance_sampling_ratio/min": 0.1564565896987915, "sampling/sampling_logp_difference/max": 2.3955295085906982, "sampling/sampling_logp_difference/mean": 0.1307830661535263, "step": 173, "step_time": 8.621828668003218 }, { "clip_ratio/high_max": 0.028273810632526875, "clip_ratio/high_mean": 0.014136905316263437, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021081349812448025, "entropy": 1.1754336431622505, "epoch": 0.00174, "grad_norm": 0.26997947692871094, "kl": 0.5316643565893173, "learning_rate": 9.999997796996588e-06, "loss": -0.022, "step": 174, "step_time": 5.268452862997947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.03125, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 191.71875, "completions/mean_terminated_length": 179.74192810058594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3498205840587616, "epoch": 0.00175, "frac_reward_zero_std": 0.25, "grad_norm": 0.6701794862747192, "kl": 0.3257319461554289, "learning_rate": 9.999997764953326e-06, "loss": -0.0026, "num_tokens": 3898363.0, "reward": 2.499316930770874, "reward_std": 2.2503104209899902, "rewards/rollout_reward_func/mean": 2.499316930770874, "rewards/rollout_reward_func/std": 2.7159764766693115, "sampling/importance_sampling_ratio/max": 0.9568949341773987, "sampling/importance_sampling_ratio/mean": 0.6555562019348145, "sampling/importance_sampling_ratio/min": 3.4279916375026616e-17, "sampling/sampling_logp_difference/max": 20.286901473999023, "sampling/sampling_logp_difference/mean": 0.3770790696144104, "step": 175, "step_time": 8.269729329002075 }, { "clip_ratio/high_max": 0.029513888992369175, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 1.3085760176181793, "epoch": 0.00176, "grad_norm": 0.5897911190986633, "kl": 0.33001732360571623, "learning_rate": 9.999997732678706e-06, "loss": -0.0036, "step": 176, "step_time": 4.949120727002082 }, { "clip_ratio/high_max": 0.020312500186264515, "clip_ratio/high_mean": 0.010156250093132257, "clip_ratio/low_mean": 0.013494318351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023650568444281816, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 344.4375, "completions/mean_terminated_length": 344.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.534710668027401, "epoch": 0.00177, "frac_reward_zero_std": 0.0, "grad_norm": 0.4783516824245453, "kl": 1.0103277955204248, "learning_rate": 9.999997700172724e-06, "loss": -0.0416, "num_tokens": 3948163.0, "reward": 1.4369618892669678, "reward_std": 3.0398616790771484, "rewards/rollout_reward_func/mean": 1.4369618892669678, "rewards/rollout_reward_func/std": 3.267970561981201, "sampling/importance_sampling_ratio/max": 0.9588543176651001, "sampling/importance_sampling_ratio/mean": 0.599725604057312, "sampling/importance_sampling_ratio/min": 1.1673103347646652e-13, "sampling/sampling_logp_difference/max": 16.579803466796875, "sampling/sampling_logp_difference/mean": 0.3121737539768219, "step": 177, "step_time": 8.731376670004465 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.037365846801549196, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.043615846894681454, "entropy": 1.5510089695453644, "epoch": 0.00178, "grad_norm": 0.33265531063079834, "kl": 1.0262525640428066, "learning_rate": 9.999997667435383e-06, "loss": -0.0422, "step": 178, "step_time": 5.071602681999138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 158.53125, "completions/mean_terminated_length": 158.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0116889923810959, "epoch": 0.00179, "frac_reward_zero_std": 0.0, "grad_norm": 0.32558098435401917, "kl": 0.6755421012639999, "learning_rate": 9.999997634466684e-06, "loss": 0.003, "num_tokens": 3989375.0, "reward": 1.8584270477294922, "reward_std": 2.3437795639038086, "rewards/rollout_reward_func/mean": 1.8584270477294922, "rewards/rollout_reward_func/std": 2.5701847076416016, "sampling/importance_sampling_ratio/max": 0.9314047694206238, "sampling/importance_sampling_ratio/mean": 0.7484275698661804, "sampling/importance_sampling_ratio/min": 0.03350019454956055, "sampling/sampling_logp_difference/max": 1.390899658203125, "sampling/sampling_logp_difference/mean": 0.11382357776165009, "step": 179, "step_time": 8.093203769001775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.9663640707731247, "epoch": 0.0018, "grad_norm": 0.2810424566268921, "kl": 0.6517706848680973, "learning_rate": 9.999997601266627e-06, "loss": 0.0022, "step": 180, "step_time": 5.012043394999637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0059523810632526875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0059523810632526875, "completions/clipped_ratio": 0.0625, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 333.90625, "completions/mean_terminated_length": 343.0333557128906, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0782750695943832, "epoch": 0.00181, "frac_reward_zero_std": 0.0, "grad_norm": 0.2106042057275772, "kl": 0.38070189394056797, "learning_rate": 9.999997567835209e-06, "loss": -0.0258, "num_tokens": 4037805.0, "reward": 0.8984715342521667, "reward_std": 3.050546646118164, "rewards/rollout_reward_func/mean": 0.8984715342521667, "rewards/rollout_reward_func/std": 3.374727964401245, "sampling/importance_sampling_ratio/max": 0.9107234477996826, "sampling/importance_sampling_ratio/mean": 0.6416457891464233, "sampling/importance_sampling_ratio/min": 0.009813060984015465, "sampling/sampling_logp_difference/max": 3.442042350769043, "sampling/sampling_logp_difference/mean": 0.14227959513664246, "step": 181, "step_time": 8.59453404299893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "entropy": 1.0471737086772919, "epoch": 0.00182, "grad_norm": 0.22333744168281555, "kl": 0.39696072041988373, "learning_rate": 9.999997534172434e-06, "loss": -0.0258, "step": 182, "step_time": 5.172055003002242 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 360.6875, "completions/mean_terminated_length": 360.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3404680714011192, "epoch": 0.00183, "frac_reward_zero_std": 0.0, "grad_norm": 0.5119001865386963, "kl": 0.839600995182991, "learning_rate": 9.999997500278298e-06, "loss": -0.0355, "num_tokens": 4088604.0, "reward": 1.6767258644104004, "reward_std": 3.0893173217773438, "rewards/rollout_reward_func/mean": 1.6767258644104004, "rewards/rollout_reward_func/std": 3.207479238510132, "sampling/importance_sampling_ratio/max": 0.8925432562828064, "sampling/importance_sampling_ratio/mean": 0.6253080368041992, "sampling/importance_sampling_ratio/min": 8.735227859858516e-19, "sampling/sampling_logp_difference/max": 16.013456344604492, "sampling/sampling_logp_difference/mean": 0.36809104681015015, "step": 183, "step_time": 8.733972622996589 }, { "clip_ratio/high_max": 0.029220780357718468, "clip_ratio/high_mean": 0.014610390178859234, "clip_ratio/low_mean": 0.014610390178859234, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029220780357718468, "entropy": 1.3180863931775093, "epoch": 0.00184, "grad_norm": 0.24217595160007477, "kl": 0.8397463150322437, "learning_rate": 9.999997466152803e-06, "loss": -0.0372, "step": 184, "step_time": 4.716836962999878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0113387166056782, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.0113387166056782, "completions/clipped_ratio": 0.0625, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 279.625, "completions/mean_terminated_length": 297.20001220703125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1348425224423409, "epoch": 0.00185, "frac_reward_zero_std": 0.0, "grad_norm": 0.4794715940952301, "kl": 0.747586615383625, "learning_rate": 9.999997431795949e-06, "loss": -0.0274, "num_tokens": 4135709.0, "reward": 1.276407241821289, "reward_std": 2.9116969108581543, "rewards/rollout_reward_func/mean": 1.276407241821289, "rewards/rollout_reward_func/std": 3.2596077919006348, "sampling/importance_sampling_ratio/max": 0.9243360161781311, "sampling/importance_sampling_ratio/mean": 0.6617485880851746, "sampling/importance_sampling_ratio/min": 9.847686651315803e-20, "sampling/sampling_logp_difference/max": 17.884483337402344, "sampling/sampling_logp_difference/mean": 0.5437909960746765, "step": 185, "step_time": 9.149337796998225 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.01974431867711246, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025994318770244718, "entropy": 1.1347833797335625, "epoch": 0.00186, "grad_norm": 0.16279926896095276, "kl": 0.7182964235544205, "learning_rate": 9.999997397207736e-06, "loss": -0.028, "step": 186, "step_time": 5.161401362000106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 357.96875, "completions/mean_terminated_length": 355.774169921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1380848065018654, "epoch": 0.00187, "frac_reward_zero_std": 0.0, "grad_norm": 0.5420560240745544, "kl": 0.6003437247127295, "learning_rate": 9.999997362388163e-06, "loss": -0.0455, "num_tokens": 4185433.0, "reward": 0.7068508863449097, "reward_std": 3.0392227172851562, "rewards/rollout_reward_func/mean": 0.7068508863449097, "rewards/rollout_reward_func/std": 3.2360968589782715, "sampling/importance_sampling_ratio/max": 0.9126468896865845, "sampling/importance_sampling_ratio/mean": 0.6868407130241394, "sampling/importance_sampling_ratio/min": 0.03512981906533241, "sampling/sampling_logp_difference/max": 2.663797616958618, "sampling/sampling_logp_difference/mean": 0.12905558943748474, "step": 187, "step_time": 8.93599180899946 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.02361111156642437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030555556062608957, "entropy": 1.1316656172275543, "epoch": 0.00188, "grad_norm": 0.4236421287059784, "kl": 0.657247063703835, "learning_rate": 9.999997327337232e-06, "loss": -0.0469, "step": 188, "step_time": 4.802313382000648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 117.5625, "completions/mean_terminated_length": 117.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7984849289059639, "epoch": 0.00189, "frac_reward_zero_std": 0.5, "grad_norm": 0.1155574694275856, "kl": 0.3184248134493828, "learning_rate": 9.99999729205494e-06, "loss": -0.0302, "num_tokens": 4224343.0, "reward": 2.0630064010620117, "reward_std": 1.779564380645752, "rewards/rollout_reward_func/mean": 2.0630064010620117, "rewards/rollout_reward_func/std": 3.0436079502105713, "sampling/importance_sampling_ratio/max": 0.9432691335678101, "sampling/importance_sampling_ratio/mean": 0.8364355564117432, "sampling/importance_sampling_ratio/min": 0.0507090762257576, "sampling/sampling_logp_difference/max": 1.5988984107971191, "sampling/sampling_logp_difference/mean": 0.08181546628475189, "step": 189, "step_time": 8.423016906002886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.7920808494091034, "epoch": 0.0019, "grad_norm": 0.11213742941617966, "kl": 0.33014224329963326, "learning_rate": 9.99999725654129e-06, "loss": -0.0301, "step": 190, "step_time": 4.437200784996094 }, { "clip_ratio/high_max": 0.0243055559694767, "clip_ratio/high_mean": 0.01215277798473835, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944729015231, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 454.21875, "completions/mean_terminated_length": 454.21875, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 1.0614012479782104, "epoch": 0.00191, "frac_reward_zero_std": 0.0, "grad_norm": 0.4600975513458252, "kl": 0.5152530409395695, "learning_rate": 9.999997220796281e-06, "loss": -0.0142, "num_tokens": 4279185.0, "reward": 0.8890839219093323, "reward_std": 2.8277111053466797, "rewards/rollout_reward_func/mean": 0.8890839219093323, "rewards/rollout_reward_func/std": 2.9869611263275146, "sampling/importance_sampling_ratio/max": 0.8680065870285034, "sampling/importance_sampling_ratio/mean": 0.6920762062072754, "sampling/importance_sampling_ratio/min": 4.2203895456972e-20, "sampling/sampling_logp_difference/max": 16.09724998474121, "sampling/sampling_logp_difference/mean": 0.47399020195007324, "step": 191, "step_time": 9.320632021001074 }, { "clip_ratio/high_max": 0.019097222480922937, "clip_ratio/high_mean": 0.009548611240461469, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944729015231, "entropy": 1.0711845234036446, "epoch": 0.00192, "grad_norm": 0.1867055743932724, "kl": 0.5323042664676905, "learning_rate": 9.999997184819913e-06, "loss": -0.0153, "step": 192, "step_time": 4.748838251000052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 234.125, "completions/mean_terminated_length": 234.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1020602136850357, "epoch": 0.00193, "frac_reward_zero_std": 0.25, "grad_norm": 0.4169532060623169, "kl": 0.758047066628933, "learning_rate": 9.999997148612186e-06, "loss": -0.0044, "num_tokens": 4323015.0, "reward": 2.0684828758239746, "reward_std": 2.020127773284912, "rewards/rollout_reward_func/mean": 2.0684828758239746, "rewards/rollout_reward_func/std": 2.868218183517456, "sampling/importance_sampling_ratio/max": 0.9295993447303772, "sampling/importance_sampling_ratio/mean": 0.734311580657959, "sampling/importance_sampling_ratio/min": 0.0015417675022035837, "sampling/sampling_logp_difference/max": 3.0371012687683105, "sampling/sampling_logp_difference/mean": 0.16195815801620483, "step": 193, "step_time": 8.840593473998524 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.0870562717318535, "epoch": 0.00194, "grad_norm": 0.1980658322572708, "kl": 0.6223065834492445, "learning_rate": 9.9999971121731e-06, "loss": -0.0049, "step": 194, "step_time": 4.6252472500000295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 123.1875, "completions/mean_terminated_length": 126.64515686035156, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6599768772721291, "epoch": 0.00195, "frac_reward_zero_std": 0.5, "grad_norm": 0.22544638812541962, "kl": 0.19908862141892314, "learning_rate": 9.999997075502653e-06, "loss": -0.0119, "num_tokens": 4357995.0, "reward": 2.501845359802246, "reward_std": 0.8407708406448364, "rewards/rollout_reward_func/mean": 2.501845359802246, "rewards/rollout_reward_func/std": 2.5005409717559814, "sampling/importance_sampling_ratio/max": 0.9312810301780701, "sampling/importance_sampling_ratio/mean": 0.8346703052520752, "sampling/importance_sampling_ratio/min": 0.0035605940502136946, "sampling/sampling_logp_difference/max": 2.109342575073242, "sampling/sampling_logp_difference/mean": 0.07571840286254883, "step": 195, "step_time": 8.050043121000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009785353671759367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009785353671759367, "entropy": 0.655437558889389, "epoch": 0.00196, "grad_norm": 0.14439822733402252, "kl": 0.1971259443089366, "learning_rate": 9.999997038600848e-06, "loss": -0.0122, "step": 196, "step_time": 4.226299927000582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 155.5625, "completions/mean_terminated_length": 145.40000915527344, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8626627437770367, "epoch": 0.00197, "frac_reward_zero_std": 0.5, "grad_norm": 0.12253255397081375, "kl": 0.2749085179530084, "learning_rate": 9.999997001467682e-06, "loss": -0.007, "num_tokens": 4394897.0, "reward": 0.9630972743034363, "reward_std": 0.8132377862930298, "rewards/rollout_reward_func/mean": 0.9630972743034363, "rewards/rollout_reward_func/std": 3.0222761631011963, "sampling/importance_sampling_ratio/max": 0.9335094690322876, "sampling/importance_sampling_ratio/mean": 0.7799872159957886, "sampling/importance_sampling_ratio/min": 0.002898112405091524, "sampling/sampling_logp_difference/max": 2.763084650039673, "sampling/sampling_logp_difference/mean": 0.13351495563983917, "step": 197, "step_time": 8.925236802002473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8579280860722065, "epoch": 0.00198, "grad_norm": 0.12197300791740417, "kl": 0.3110062386840582, "learning_rate": 9.99999696410316e-06, "loss": -0.007, "step": 198, "step_time": 4.647365123999407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 271.125, "completions/mean_terminated_length": 271.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1578101739287376, "epoch": 0.00199, "frac_reward_zero_std": 0.25, "grad_norm": 0.28647422790527344, "kl": 0.5752187334001064, "learning_rate": 9.999996926507279e-06, "loss": -0.0226, "num_tokens": 4439890.0, "reward": 2.023836612701416, "reward_std": 1.6840295791625977, "rewards/rollout_reward_func/mean": 2.023836612701416, "rewards/rollout_reward_func/std": 3.3191354274749756, "sampling/importance_sampling_ratio/max": 0.9309858679771423, "sampling/importance_sampling_ratio/mean": 0.7134063243865967, "sampling/importance_sampling_ratio/min": 2.0090324903261827e-18, "sampling/sampling_logp_difference/max": 21.622013092041016, "sampling/sampling_logp_difference/mean": 0.36890923976898193, "step": 199, "step_time": 9.122759366997343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1616975739598274, "epoch": 0.002, "grad_norm": 0.19404290616512299, "kl": 0.5596782006323338, "learning_rate": 9.999996888680038e-06, "loss": -0.0231, "step": 200, "step_time": 4.744486274001247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 246.90625, "completions/mean_terminated_length": 246.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.591596931219101, "epoch": 0.00201, "frac_reward_zero_std": 0.25, "grad_norm": 0.6494370698928833, "kl": 0.8919636253267527, "learning_rate": 9.999996850621436e-06, "loss": -0.0086, "num_tokens": 4483523.0, "reward": 1.1744418144226074, "reward_std": 2.249213695526123, "rewards/rollout_reward_func/mean": 1.1744418144226074, "rewards/rollout_reward_func/std": 3.357861042022705, "sampling/importance_sampling_ratio/max": 1.7491858005523682, "sampling/importance_sampling_ratio/mean": 0.8766854405403137, "sampling/importance_sampling_ratio/min": 0.629192590713501, "sampling/sampling_logp_difference/max": 0.7907724380493164, "sampling/sampling_logp_difference/mean": 0.05357443541288376, "step": 201, "step_time": 8.169212595001227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 0.5978072769939899, "epoch": 0.00202, "grad_norm": 0.17305338382720947, "kl": 0.8899145806208253, "learning_rate": 9.999996812331476e-06, "loss": -0.0098, "step": 202, "step_time": 4.962951425000938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 294.15625, "completions/mean_terminated_length": 286.51611328125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6201420500874519, "epoch": 0.00203, "frac_reward_zero_std": 0.25, "grad_norm": 0.43295058608055115, "kl": 0.5509616434574127, "learning_rate": 9.999996773810157e-06, "loss": -0.0218, "num_tokens": 4530034.0, "reward": 2.4974052906036377, "reward_std": 2.2990176677703857, "rewards/rollout_reward_func/mean": 2.4974052906036377, "rewards/rollout_reward_func/std": 3.054955244064331, "sampling/importance_sampling_ratio/max": 0.9481468796730042, "sampling/importance_sampling_ratio/mean": 0.8262695074081421, "sampling/importance_sampling_ratio/min": 0.19285765290260315, "sampling/sampling_logp_difference/max": 1.1609103679656982, "sampling/sampling_logp_difference/mean": 0.0485273078083992, "step": 203, "step_time": 8.375749794002331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "entropy": 0.6200508028268814, "epoch": 0.00204, "grad_norm": 0.33977633714675903, "kl": 0.548204233404249, "learning_rate": 9.99999673505748e-06, "loss": -0.0236, "step": 204, "step_time": 4.996100856000339 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 461.34375, "completions/mean_terminated_length": 461.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9644775167107582, "epoch": 0.00205, "frac_reward_zero_std": 0.0, "grad_norm": 0.6437802314758301, "kl": 1.0466136187314987, "learning_rate": 9.999996696073441e-06, "loss": -0.0438, "num_tokens": 4584331.0, "reward": -0.33409807085990906, "reward_std": 2.5986135005950928, "rewards/rollout_reward_func/mean": -0.33409807085990906, "rewards/rollout_reward_func/std": 2.8352653980255127, "sampling/importance_sampling_ratio/max": 1.8716126680374146, "sampling/importance_sampling_ratio/mean": 0.7733796834945679, "sampling/importance_sampling_ratio/min": 0.13008280098438263, "sampling/sampling_logp_difference/max": 1.920009970664978, "sampling/sampling_logp_difference/mean": 0.10025201737880707, "step": 205, "step_time": 8.679284527001073 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.021006944589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03350694477558136, "entropy": 0.9532321467995644, "epoch": 0.00206, "grad_norm": 0.2603062093257904, "kl": 1.0703542828559875, "learning_rate": 9.999996656858045e-06, "loss": -0.0452, "step": 206, "step_time": 5.21408338800029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 425.34375, "completions/mean_terminated_length": 425.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7860305085778236, "epoch": 0.00207, "frac_reward_zero_std": 0.0, "grad_norm": 0.38173598051071167, "kl": 0.8065525125712156, "learning_rate": 9.99999661741129e-06, "loss": -0.0276, "num_tokens": 4637062.0, "reward": 2.377408266067505, "reward_std": 3.018934726715088, "rewards/rollout_reward_func/mean": 2.377408266067505, "rewards/rollout_reward_func/std": 3.0682225227355957, "sampling/importance_sampling_ratio/max": 0.9007273316383362, "sampling/importance_sampling_ratio/mean": 0.7377510666847229, "sampling/importance_sampling_ratio/min": 0.08246096968650818, "sampling/sampling_logp_difference/max": 2.2741940021514893, "sampling/sampling_logp_difference/mean": 0.07524667680263519, "step": 207, "step_time": 8.87632065499929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.7914779782295227, "epoch": 0.00208, "grad_norm": 0.17640407383441925, "kl": 0.6877205707132816, "learning_rate": 9.999996577733175e-06, "loss": -0.0281, "step": 208, "step_time": 5.273058081000272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 185.78125, "completions/mean_terminated_length": 191.258056640625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6332475021481514, "epoch": 0.00209, "frac_reward_zero_std": 0.0, "grad_norm": 0.17283643782138824, "kl": 0.3301713680848479, "learning_rate": 9.9999965378237e-06, "loss": -0.0156, "num_tokens": 4678544.0, "reward": 1.657301425933838, "reward_std": 1.9066685438156128, "rewards/rollout_reward_func/mean": 1.657301425933838, "rewards/rollout_reward_func/std": 3.1961987018585205, "sampling/importance_sampling_ratio/max": 1.1795495748519897, "sampling/importance_sampling_ratio/mean": 0.8405283093452454, "sampling/importance_sampling_ratio/min": 0.01102153118699789, "sampling/sampling_logp_difference/max": 2.1818251609802246, "sampling/sampling_logp_difference/mean": 0.06732013076543808, "step": 209, "step_time": 8.590872510001645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6407940611243248, "epoch": 0.0021, "grad_norm": 0.1831606775522232, "kl": 0.33017597906291485, "learning_rate": 9.999996497682868e-06, "loss": -0.0158, "step": 210, "step_time": 5.169296244001089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018181818071752787, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.018181818071752787, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 462.9375, "completions/mean_terminated_length": 462.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1449619233608246, "epoch": 0.00211, "frac_reward_zero_std": 0.0, "grad_norm": 0.19633972644805908, "kl": 0.8435094319283962, "learning_rate": 9.999996457310676e-06, "loss": -0.0334, "num_tokens": 4731452.0, "reward": 0.9086183905601501, "reward_std": 2.6187591552734375, "rewards/rollout_reward_func/mean": 0.9086183905601501, "rewards/rollout_reward_func/std": 3.086013078689575, "sampling/importance_sampling_ratio/max": 0.8812808990478516, "sampling/importance_sampling_ratio/mean": 0.6238290071487427, "sampling/importance_sampling_ratio/min": 0.0178877841681242, "sampling/sampling_logp_difference/max": 3.0250370502471924, "sampling/sampling_logp_difference/mean": 0.15976926684379578, "step": 211, "step_time": 8.667569829000058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019744318444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019744318444281816, "entropy": 1.1249615177512169, "epoch": 0.00212, "grad_norm": 0.2565966546535492, "kl": 0.9167128168046474, "learning_rate": 9.999996416707125e-06, "loss": -0.0329, "step": 212, "step_time": 5.08248347800145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 343.53125, "completions/mean_terminated_length": 354.0967712402344, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8335337489843369, "epoch": 0.00213, "frac_reward_zero_std": 0.0, "grad_norm": 0.39169129729270935, "kl": 0.6693573538213968, "learning_rate": 9.999996375872214e-06, "loss": -0.0344, "num_tokens": 4779167.0, "reward": 1.6568021774291992, "reward_std": 2.4220218658447266, "rewards/rollout_reward_func/mean": 1.6568021774291992, "rewards/rollout_reward_func/std": 2.785282850265503, "sampling/importance_sampling_ratio/max": 0.9350288510322571, "sampling/importance_sampling_ratio/mean": 0.7424147129058838, "sampling/importance_sampling_ratio/min": 0.03014981932938099, "sampling/sampling_logp_difference/max": 1.5954999923706055, "sampling/sampling_logp_difference/mean": 0.09769216179847717, "step": 213, "step_time": 8.387192665000839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018876262940466404, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018876262940466404, "entropy": 0.8230449855327606, "epoch": 0.00214, "grad_norm": 0.13148921728134155, "kl": 0.6841633897274733, "learning_rate": 9.999996334805946e-06, "loss": -0.035, "step": 214, "step_time": 4.5258787459970335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 558.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 213.59375, "completions/mean_terminated_length": 213.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.696804404258728, "epoch": 0.00215, "frac_reward_zero_std": 0.5, "grad_norm": 0.11481602489948273, "kl": 0.5183723168447614, "learning_rate": 9.999996293508317e-06, "loss": -0.0038, "num_tokens": 4820478.0, "reward": 2.6149048805236816, "reward_std": 1.6366126537322998, "rewards/rollout_reward_func/mean": 2.6149048805236816, "rewards/rollout_reward_func/std": 2.5087056159973145, "sampling/importance_sampling_ratio/max": 1.5731240510940552, "sampling/importance_sampling_ratio/mean": 0.848854660987854, "sampling/importance_sampling_ratio/min": 0.007528090383857489, "sampling/sampling_logp_difference/max": 4.537612438201904, "sampling/sampling_logp_difference/mean": 0.08785554766654968, "step": 215, "step_time": 8.367950642999858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.703733965754509, "epoch": 0.00216, "grad_norm": 0.12742580473423004, "kl": 0.5174695188179612, "learning_rate": 9.999996251979329e-06, "loss": -0.0037, "step": 216, "step_time": 4.826893954999832 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012626262847334146, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 418.875, "completions/mean_terminated_length": 418.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0903236865997314, "epoch": 0.00217, "frac_reward_zero_std": 0.0, "grad_norm": 0.3176267743110657, "kl": 0.8902065325528383, "learning_rate": 9.999996210218981e-06, "loss": -0.0187, "num_tokens": 4873134.0, "reward": 1.0933709144592285, "reward_std": 2.3622922897338867, "rewards/rollout_reward_func/mean": 1.0933709144592285, "rewards/rollout_reward_func/std": 3.089589834213257, "sampling/importance_sampling_ratio/max": 1.355116367340088, "sampling/importance_sampling_ratio/mean": 0.7158704400062561, "sampling/importance_sampling_ratio/min": 0.09111077338457108, "sampling/sampling_logp_difference/max": 1.9858126640319824, "sampling/sampling_logp_difference/mean": 0.11868558824062347, "step": 217, "step_time": 8.718645267001193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019444444682449102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "entropy": 1.102194145321846, "epoch": 0.00218, "grad_norm": 0.4394472539424896, "kl": 0.9111283831298351, "learning_rate": 9.999996168227277e-06, "loss": -0.0193, "step": 218, "step_time": 4.786762224000995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 329.9375, "completions/mean_terminated_length": 328.8709716796875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0397082269191742, "epoch": 0.00219, "frac_reward_zero_std": 0.0, "grad_norm": 0.3301519751548767, "kl": 1.104004004970193, "learning_rate": 9.999996126004213e-06, "loss": -0.0334, "num_tokens": 4920448.0, "reward": 2.3233933448791504, "reward_std": 2.274972438812256, "rewards/rollout_reward_func/mean": 2.3233933448791504, "rewards/rollout_reward_func/std": 2.8106513023376465, "sampling/importance_sampling_ratio/max": 1.0304447412490845, "sampling/importance_sampling_ratio/mean": 0.69205242395401, "sampling/importance_sampling_ratio/min": 0.002680640434846282, "sampling/sampling_logp_difference/max": 2.6901865005493164, "sampling/sampling_logp_difference/mean": 0.1642218977212906, "step": 219, "step_time": 9.237330212998131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02083333395421505, "entropy": 1.0429672524333, "epoch": 0.0022, "grad_norm": 0.28947964310646057, "kl": 1.043170753866434, "learning_rate": 9.999996083549788e-06, "loss": -0.0333, "step": 220, "step_time": 4.70553132400164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011101973708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011101973708719015, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 411.09375, "completions/mean_terminated_length": 411.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.32797771692276, "epoch": 0.00221, "frac_reward_zero_std": 0.0, "grad_norm": 0.5300762057304382, "kl": 1.8134417608380318, "learning_rate": 9.999996040864003e-06, "loss": -0.0003, "num_tokens": 4973489.0, "reward": 1.0456185340881348, "reward_std": 2.758831024169922, "rewards/rollout_reward_func/mean": 1.0456185340881348, "rewards/rollout_reward_func/std": 3.0662970542907715, "sampling/importance_sampling_ratio/max": 1.2787468433380127, "sampling/importance_sampling_ratio/mean": 0.620360791683197, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 2.767888069152832, "sampling/sampling_logp_difference/mean": 0.16791388392448425, "step": 221, "step_time": 9.32872241399673 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.02892743283882737, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03517743246629834, "entropy": 1.3044675290584564, "epoch": 0.00222, "grad_norm": 0.3626461923122406, "kl": 1.7157214432954788, "learning_rate": 9.999995997946861e-06, "loss": -0.0016, "step": 222, "step_time": 4.675181238000732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 740.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 373.71875, "completions/mean_terminated_length": 376.933349609375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9747419282793999, "epoch": 0.00223, "frac_reward_zero_std": 0.0, "grad_norm": 0.4722682535648346, "kl": 0.7415418829768896, "learning_rate": 9.999995954798361e-06, "loss": 0.0017, "num_tokens": 5024393.0, "reward": 1.3244436979293823, "reward_std": 2.5439281463623047, "rewards/rollout_reward_func/mean": 1.3244436979293823, "rewards/rollout_reward_func/std": 2.8512392044067383, "sampling/importance_sampling_ratio/max": 1.4355765581130981, "sampling/importance_sampling_ratio/mean": 0.7522436380386353, "sampling/importance_sampling_ratio/min": 0.006714152172207832, "sampling/sampling_logp_difference/max": 3.15132212638855, "sampling/sampling_logp_difference/mean": 0.12884238362312317, "step": 223, "step_time": 9.057417474999966 }, { "clip_ratio/high_max": 0.03579192655161023, "clip_ratio/high_mean": 0.017895963275805116, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024145963368937373, "entropy": 0.941016424447298, "epoch": 0.00224, "grad_norm": 0.19660067558288574, "kl": 0.7271957108750939, "learning_rate": 9.9999959114185e-06, "loss": 0.0007, "step": 224, "step_time": 4.7449556950032274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 390.84375, "completions/mean_terminated_length": 390.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0824428796768188, "epoch": 0.00225, "frac_reward_zero_std": 0.0, "grad_norm": 0.5181706547737122, "kl": 0.9321527145802975, "learning_rate": 9.999995867807281e-06, "loss": -0.0049, "num_tokens": 5073359.0, "reward": 1.1813201904296875, "reward_std": 2.489523410797119, "rewards/rollout_reward_func/mean": 1.1813201904296875, "rewards/rollout_reward_func/std": 2.989685535430908, "sampling/importance_sampling_ratio/max": 1.0593928098678589, "sampling/importance_sampling_ratio/mean": 0.7365450859069824, "sampling/importance_sampling_ratio/min": 2.6850702852243558e-06, "sampling/sampling_logp_difference/max": 3.901078701019287, "sampling/sampling_logp_difference/mean": 0.16605928540229797, "step": 225, "step_time": 9.092311486003382 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.03415178554132581, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.04821428516879678, "entropy": 1.0631348565220833, "epoch": 0.00226, "grad_norm": 0.2644418478012085, "kl": 0.8894416689872742, "learning_rate": 9.999995823964702e-06, "loss": -0.0054, "step": 226, "step_time": 4.670969958000569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 478.0, "completions/max_terminated_length": 478.0, "completions/mean_length": 54.3125, "completions/mean_terminated_length": 54.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6244743056595325, "epoch": 0.00227, "frac_reward_zero_std": 0.5, "grad_norm": 0.04985964298248291, "kl": 0.3561143008992076, "learning_rate": 9.999995779890764e-06, "loss": -0.0113, "num_tokens": 5107682.0, "reward": 2.3361146450042725, "reward_std": 1.2487618923187256, "rewards/rollout_reward_func/mean": 2.3361146450042725, "rewards/rollout_reward_func/std": 2.8660035133361816, "sampling/importance_sampling_ratio/max": 0.9396150708198547, "sampling/importance_sampling_ratio/mean": 0.8577675223350525, "sampling/importance_sampling_ratio/min": 0.14647667109966278, "sampling/sampling_logp_difference/max": 1.725661039352417, "sampling/sampling_logp_difference/mean": 0.053017985075712204, "step": 227, "step_time": 7.844727080004304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6224922463297844, "epoch": 0.00228, "grad_norm": 0.04276252165436745, "kl": 0.3559771813452244, "learning_rate": 9.999995735585469e-06, "loss": -0.0114, "step": 228, "step_time": 4.060119945001134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 167.21875, "completions/mean_terminated_length": 157.5806427001953, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7033568359911442, "epoch": 0.00229, "frac_reward_zero_std": 0.0, "grad_norm": 0.2188216745853424, "kl": 0.4126767748966813, "learning_rate": 9.999995691048812e-06, "loss": -0.0055, "num_tokens": 5149287.0, "reward": 0.9356297254562378, "reward_std": 2.2680954933166504, "rewards/rollout_reward_func/mean": 0.9356297254562378, "rewards/rollout_reward_func/std": 3.428934097290039, "sampling/importance_sampling_ratio/max": 0.9407537579536438, "sampling/importance_sampling_ratio/mean": 0.803857147693634, "sampling/importance_sampling_ratio/min": 3.9144373444753707e-19, "sampling/sampling_logp_difference/max": 16.985416412353516, "sampling/sampling_logp_difference/mean": 0.2928555905818939, "step": 229, "step_time": 8.88192456700017 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 0.6913694590330124, "epoch": 0.0023, "grad_norm": 0.2133348435163498, "kl": 0.4004577547311783, "learning_rate": 9.999995646280798e-06, "loss": -0.0054, "step": 230, "step_time": 4.571844719002911 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 152.1875, "completions/mean_terminated_length": 152.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9427185505628586, "epoch": 0.00231, "frac_reward_zero_std": 0.0, "grad_norm": 0.3564832806587219, "kl": 0.6509192464873195, "learning_rate": 9.999995601281422e-06, "loss": -0.0535, "num_tokens": 5189927.0, "reward": -0.36012977361679077, "reward_std": 2.3448853492736816, "rewards/rollout_reward_func/mean": -0.36012977361679077, "rewards/rollout_reward_func/std": 3.4089276790618896, "sampling/importance_sampling_ratio/max": 1.0191775560379028, "sampling/importance_sampling_ratio/mean": 0.7913844585418701, "sampling/importance_sampling_ratio/min": 5.940558425900208e-15, "sampling/sampling_logp_difference/max": 22.44684600830078, "sampling/sampling_logp_difference/mean": 0.2765757739543915, "step": 231, "step_time": 8.439364128998932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.9410406872630119, "epoch": 0.00232, "grad_norm": 0.7466528415679932, "kl": 0.6674172235652804, "learning_rate": 9.99999555605069e-06, "loss": -0.0532, "step": 232, "step_time": 4.924921580000955 }, { "clip_ratio/high_max": 0.025252525694668293, "clip_ratio/high_mean": 0.012626262847334146, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017090548761188984, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 272.09375, "completions/mean_terminated_length": 272.09375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.975367046892643, "epoch": 0.00233, "frac_reward_zero_std": 0.0, "grad_norm": 0.19380918145179749, "kl": 0.46408181078732014, "learning_rate": 9.999995510588598e-06, "loss": 0.0031, "num_tokens": 5236386.0, "reward": -0.3273877501487732, "reward_std": 2.199934959411621, "rewards/rollout_reward_func/mean": -0.3273877501487732, "rewards/rollout_reward_func/std": 2.8254146575927734, "sampling/importance_sampling_ratio/max": 1.1314480304718018, "sampling/importance_sampling_ratio/mean": 0.7715117931365967, "sampling/importance_sampling_ratio/min": 0.0428294874727726, "sampling/sampling_logp_difference/max": 2.7417025566101074, "sampling/sampling_logp_difference/mean": 0.10019803047180176, "step": 233, "step_time": 8.763465290998283 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "entropy": 0.9615056216716766, "epoch": 0.00234, "grad_norm": 0.4442107081413269, "kl": 0.4545847736299038, "learning_rate": 9.999995464895147e-06, "loss": 0.0035, "step": 234, "step_time": 5.190924710004765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 443.21875, "completions/mean_terminated_length": 443.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0601079687476158, "epoch": 0.00235, "frac_reward_zero_std": 0.0, "grad_norm": 0.6337189078330994, "kl": 0.6684720553457737, "learning_rate": 9.999995418970336e-06, "loss": -0.0321, "num_tokens": 5289451.0, "reward": -0.017627760767936707, "reward_std": 2.878835916519165, "rewards/rollout_reward_func/mean": -0.017627760767936707, "rewards/rollout_reward_func/std": 2.990732431411743, "sampling/importance_sampling_ratio/max": 1.1733722686767578, "sampling/importance_sampling_ratio/mean": 0.7329757809638977, "sampling/importance_sampling_ratio/min": 0.010072683915495872, "sampling/sampling_logp_difference/max": 3.6382944583892822, "sampling/sampling_logp_difference/mean": 0.11709397286176682, "step": 235, "step_time": 8.839183864996812 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.020312500186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026562500279396772, "entropy": 1.091833919286728, "epoch": 0.00236, "grad_norm": 0.34062838554382324, "kl": 0.6747406236827374, "learning_rate": 9.999995372814168e-06, "loss": -0.0339, "step": 236, "step_time": 5.271043752998594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 378.46875, "completions/mean_terminated_length": 372.7419128417969, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0634438320994377, "epoch": 0.00237, "frac_reward_zero_std": 0.25, "grad_norm": 0.30584099888801575, "kl": 0.6172224469482899, "learning_rate": 9.999995326426639e-06, "loss": -0.0056, "num_tokens": 5338499.0, "reward": 0.7893303036689758, "reward_std": 1.984004259109497, "rewards/rollout_reward_func/mean": 0.7893303036689758, "rewards/rollout_reward_func/std": 2.924335479736328, "sampling/importance_sampling_ratio/max": 0.9698473215103149, "sampling/importance_sampling_ratio/mean": 0.6748290657997131, "sampling/importance_sampling_ratio/min": 0.12582892179489136, "sampling/sampling_logp_difference/max": 1.6888145208358765, "sampling/sampling_logp_difference/mean": 0.10638640820980072, "step": 237, "step_time": 8.796076087999609 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.02814352815039456, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035087972646579146, "entropy": 1.1096309050917625, "epoch": 0.00238, "grad_norm": 0.2421712577342987, "kl": 0.636338334530592, "learning_rate": 9.999995279807751e-06, "loss": -0.0065, "step": 238, "step_time": 5.212786549003795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 214.125, "completions/mean_terminated_length": 214.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8898054957389832, "epoch": 0.00239, "frac_reward_zero_std": 0.25, "grad_norm": 0.15863069891929626, "kl": 0.3877704804763198, "learning_rate": 9.999995232957505e-06, "loss": -0.0161, "num_tokens": 5381919.0, "reward": 1.8909127712249756, "reward_std": 1.7119274139404297, "rewards/rollout_reward_func/mean": 1.8909127712249756, "rewards/rollout_reward_func/std": 3.188525438308716, "sampling/importance_sampling_ratio/max": 0.9419252276420593, "sampling/importance_sampling_ratio/mean": 0.789331316947937, "sampling/importance_sampling_ratio/min": 4.892266713268922e-12, "sampling/sampling_logp_difference/max": 9.62974739074707, "sampling/sampling_logp_difference/mean": 0.2322128415107727, "step": 239, "step_time": 8.57158091400197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.9197526350617409, "epoch": 0.0024, "grad_norm": 0.17590197920799255, "kl": 0.3974827779456973, "learning_rate": 9.999995185875899e-06, "loss": -0.0159, "step": 240, "step_time": 4.901875970001129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 372.34375, "completions/mean_terminated_length": 372.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.698855072259903, "epoch": 0.00241, "frac_reward_zero_std": 0.0, "grad_norm": 0.5035167932510376, "kl": 0.4244538666680455, "learning_rate": 9.999995138562934e-06, "loss": -0.0142, "num_tokens": 5432850.0, "reward": 0.5686291456222534, "reward_std": 2.9466123580932617, "rewards/rollout_reward_func/mean": 0.5686291456222534, "rewards/rollout_reward_func/std": 3.1917724609375, "sampling/importance_sampling_ratio/max": 0.913431704044342, "sampling/importance_sampling_ratio/mean": 0.7766443490982056, "sampling/importance_sampling_ratio/min": 0.2006615549325943, "sampling/sampling_logp_difference/max": 1.2009093761444092, "sampling/sampling_logp_difference/mean": 0.06370047479867935, "step": 241, "step_time": 8.566288219000853 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "entropy": 0.7021589577198029, "epoch": 0.00242, "grad_norm": 0.3871881663799286, "kl": 0.4062529103830457, "learning_rate": 9.99999509101861e-06, "loss": -0.0147, "step": 242, "step_time": 5.033190958998603 }, { "clip_ratio/high_max": 0.03333333367481828, "clip_ratio/high_mean": 0.01666666683740914, "clip_ratio/low_mean": 0.012620192021131516, "clip_ratio/low_min": 0.009615384973585606, "clip_ratio/region_mean": 0.029286858858540654, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 254.75, "completions/mean_terminated_length": 254.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4016404822468758, "epoch": 0.00243, "frac_reward_zero_std": 0.0, "grad_norm": 1.8030297756195068, "kl": 1.8377663809806108, "learning_rate": 9.99999504324293e-06, "loss": -0.0049, "num_tokens": 5477087.0, "reward": 0.46450889110565186, "reward_std": 2.221926689147949, "rewards/rollout_reward_func/mean": 0.46450889110565186, "rewards/rollout_reward_func/std": 3.231743335723877, "sampling/importance_sampling_ratio/max": 0.9397179484367371, "sampling/importance_sampling_ratio/mean": 0.5879740118980408, "sampling/importance_sampling_ratio/min": 4.0438856819671496e-21, "sampling/sampling_logp_difference/max": 17.312652587890625, "sampling/sampling_logp_difference/mean": 0.42104074358940125, "step": 243, "step_time": 8.872249901000032 }, { "clip_ratio/high_max": 0.03314394038170576, "clip_ratio/high_mean": 0.01657197019085288, "clip_ratio/low_mean": 0.01673951093107462, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0333114811219275, "entropy": 1.4285539761185646, "epoch": 0.00244, "grad_norm": 0.4145757853984833, "kl": 1.0108652003109455, "learning_rate": 9.999994995235887e-06, "loss": -0.0083, "step": 244, "step_time": 5.1906408769991685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013134058332070708, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013134058332070708, "completions/clipped_ratio": 0.03125, "completions/max_length": 563.0, "completions/max_terminated_length": 563.0, "completions/mean_length": 227.5625, "completions/mean_terminated_length": 221.32257080078125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.006574347615242, "epoch": 0.00245, "frac_reward_zero_std": 0.25, "grad_norm": 0.832617998123169, "kl": 0.8071983605623245, "learning_rate": 9.999994946997486e-06, "loss": -0.0368, "num_tokens": 5518890.0, "reward": 2.723998546600342, "reward_std": 2.0626885890960693, "rewards/rollout_reward_func/mean": 2.723998546600342, "rewards/rollout_reward_func/std": 2.673961877822876, "sampling/importance_sampling_ratio/max": 2.474912643432617, "sampling/importance_sampling_ratio/mean": 0.812646746635437, "sampling/importance_sampling_ratio/min": 0.05182882398366928, "sampling/sampling_logp_difference/max": 2.2612545490264893, "sampling/sampling_logp_difference/mean": 0.14765185117721558, "step": 245, "step_time": 7.979239955002413 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.016666667070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02234848542138934, "entropy": 1.0274420976638794, "epoch": 0.00246, "grad_norm": 0.6836508512496948, "kl": 0.8305524252355099, "learning_rate": 9.999994898527727e-06, "loss": -0.0356, "step": 246, "step_time": 4.758873709002728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.03125, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 180.0625, "completions/mean_terminated_length": 169.258056640625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7786482572555542, "epoch": 0.00247, "frac_reward_zero_std": 0.5, "grad_norm": 0.560104489326477, "kl": 0.445658999029547, "learning_rate": 9.999994849826609e-06, "loss": -0.0212, "num_tokens": 5559938.0, "reward": 2.973130941390991, "reward_std": 1.615952968597412, "rewards/rollout_reward_func/mean": 2.973130941390991, "rewards/rollout_reward_func/std": 2.513164758682251, "sampling/importance_sampling_ratio/max": 1.1547808647155762, "sampling/importance_sampling_ratio/mean": 0.8357301950454712, "sampling/importance_sampling_ratio/min": 0.013528180308640003, "sampling/sampling_logp_difference/max": 1.7249716520309448, "sampling/sampling_logp_difference/mean": 0.08076245337724686, "step": 247, "step_time": 8.273009928998363 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.020312500186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035937500186264515, "entropy": 0.7658293284475803, "epoch": 0.00248, "grad_norm": 0.1474018096923828, "kl": 0.30213791504502296, "learning_rate": 9.99999480089413e-06, "loss": -0.023, "step": 248, "step_time": 4.513893970997742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 572.0, "completions/max_terminated_length": 572.0, "completions/mean_length": 88.34375, "completions/mean_terminated_length": 90.67741394042969, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7756192833185196, "epoch": 0.00249, "frac_reward_zero_std": 0.25, "grad_norm": 0.11364078521728516, "kl": 0.1918597249314189, "learning_rate": 9.999994751730293e-06, "loss": -0.0396, "num_tokens": 5596972.0, "reward": 1.433401107788086, "reward_std": 1.9504826068878174, "rewards/rollout_reward_func/mean": 1.433401107788086, "rewards/rollout_reward_func/std": 3.2570269107818604, "sampling/importance_sampling_ratio/max": 0.9456012845039368, "sampling/importance_sampling_ratio/mean": 0.8297789096832275, "sampling/importance_sampling_ratio/min": 0.004721224308013916, "sampling/sampling_logp_difference/max": 1.956438422203064, "sampling/sampling_logp_difference/mean": 0.08855162560939789, "step": 249, "step_time": 8.168390324999564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011101974174380302, "clip_ratio/low_min": 0.00657894741743803, "clip_ratio/region_mean": 0.011101974174380302, "entropy": 0.7993477769196033, "epoch": 0.0025, "grad_norm": 0.09820455312728882, "kl": 0.18886340595781803, "learning_rate": 9.999994702335098e-06, "loss": -0.0398, "step": 250, "step_time": 4.677178109999659 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.01105769257992506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01887019257992506, "completions/clipped_ratio": 0.03125, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 319.96875, "completions/mean_terminated_length": 329.774169921875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.157932534813881, "epoch": 0.00251, "frac_reward_zero_std": 0.0, "grad_norm": 0.3469836711883545, "kl": 0.5096417739987373, "learning_rate": 9.999994652708542e-06, "loss": -0.0309, "num_tokens": 5644783.0, "reward": 2.3634238243103027, "reward_std": 2.729177951812744, "rewards/rollout_reward_func/mean": 2.3634238243103027, "rewards/rollout_reward_func/std": 2.8030002117156982, "sampling/importance_sampling_ratio/max": 0.938515841960907, "sampling/importance_sampling_ratio/mean": 0.6472117900848389, "sampling/importance_sampling_ratio/min": 0.0035327994264662266, "sampling/sampling_logp_difference/max": 4.0745720863342285, "sampling/sampling_logp_difference/mean": 0.1890113353729248, "step": 251, "step_time": 8.575856534002014 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.023557692766189575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.031370192766189575, "entropy": 1.1659194454550743, "epoch": 0.00252, "grad_norm": 0.28520727157592773, "kl": 0.5083718043752015, "learning_rate": 9.999994602850629e-06, "loss": -0.0319, "step": 252, "step_time": 4.731850936001138 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02638888917863369, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 250.6875, "completions/mean_terminated_length": 250.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0326822102069855, "epoch": 0.00253, "frac_reward_zero_std": 0.25, "grad_norm": 1.084718942642212, "kl": 0.5232477374374866, "learning_rate": 9.999994552761356e-06, "loss": -0.0449, "num_tokens": 5686089.0, "reward": 2.952908992767334, "reward_std": 2.1701648235321045, "rewards/rollout_reward_func/mean": 2.952908992767334, "rewards/rollout_reward_func/std": 2.510608434677124, "sampling/importance_sampling_ratio/max": 2.2361204624176025, "sampling/importance_sampling_ratio/mean": 0.8163533806800842, "sampling/importance_sampling_ratio/min": 0.03697160631418228, "sampling/sampling_logp_difference/max": 3.188304901123047, "sampling/sampling_logp_difference/mean": 0.1505456566810608, "step": 253, "step_time": 8.911108214000706 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.039168471936136484, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.0511002903804183, "entropy": 1.040377028286457, "epoch": 0.00254, "grad_norm": 0.23100261390209198, "kl": 0.5676680523902178, "learning_rate": 9.999994502440725e-06, "loss": -0.0469, "step": 254, "step_time": 4.612071822999496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 315.78125, "completions/mean_terminated_length": 315.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.804621234536171, "epoch": 0.00255, "frac_reward_zero_std": 0.0, "grad_norm": 0.4109407961368561, "kl": 0.45173647068440914, "learning_rate": 9.999994451888734e-06, "loss": -0.0284, "num_tokens": 5733568.0, "reward": -0.05570173263549805, "reward_std": 2.0378260612487793, "rewards/rollout_reward_func/mean": -0.05570173263549805, "rewards/rollout_reward_func/std": 3.2486650943756104, "sampling/importance_sampling_ratio/max": 0.9499958157539368, "sampling/importance_sampling_ratio/mean": 0.8004287481307983, "sampling/importance_sampling_ratio/min": 0.32037389278411865, "sampling/sampling_logp_difference/max": 0.6116167306900024, "sampling/sampling_logp_difference/mean": 0.05964513495564461, "step": 255, "step_time": 8.975341523999305 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.8132221773266792, "epoch": 0.00256, "grad_norm": 0.24318058788776398, "kl": 0.4551222585141659, "learning_rate": 9.999994401105384e-06, "loss": -0.0291, "step": 256, "step_time": 4.728663353000229 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 172.875, "completions/mean_terminated_length": 172.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0573903694748878, "epoch": 0.00257, "frac_reward_zero_std": 0.0, "grad_norm": 0.5027620792388916, "kl": 0.7045814846642315, "learning_rate": 9.999994350090677e-06, "loss": -0.0414, "num_tokens": 5772883.0, "reward": 0.5910930633544922, "reward_std": 2.1754002571105957, "rewards/rollout_reward_func/mean": 0.5910930633544922, "rewards/rollout_reward_func/std": 3.192164421081543, "sampling/importance_sampling_ratio/max": 0.9457374811172485, "sampling/importance_sampling_ratio/mean": 0.682003378868103, "sampling/importance_sampling_ratio/min": 0.048203688114881516, "sampling/sampling_logp_difference/max": 1.5156526565551758, "sampling/sampling_logp_difference/mean": 0.13624338805675507, "step": 257, "step_time": 8.146419198999865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.022348484490066767, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.022348484490066767, "entropy": 1.1201009973883629, "epoch": 0.00258, "grad_norm": 0.47621870040893555, "kl": 0.7734484728425741, "learning_rate": 9.999994298844607e-06, "loss": -0.0421, "step": 258, "step_time": 4.90316144800272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 502.0, "completions/max_terminated_length": 502.0, "completions/mean_length": 95.71875, "completions/mean_terminated_length": 95.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9453914910554886, "epoch": 0.00259, "frac_reward_zero_std": 0.0, "grad_norm": 1.1252281665802002, "kl": 0.47489172872155905, "learning_rate": 9.999994247367181e-06, "loss": -0.0187, "num_tokens": 5809039.0, "reward": 3.6124603748321533, "reward_std": 1.4373304843902588, "rewards/rollout_reward_func/mean": 3.6124603748321533, "rewards/rollout_reward_func/std": 1.7972360849380493, "sampling/importance_sampling_ratio/max": 0.9416337609291077, "sampling/importance_sampling_ratio/mean": 0.791317343711853, "sampling/importance_sampling_ratio/min": 0.03774319589138031, "sampling/sampling_logp_difference/max": 2.044558048248291, "sampling/sampling_logp_difference/mean": 0.09322954714298248, "step": 259, "step_time": 7.9722597600029985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.9465107098221779, "epoch": 0.0026, "grad_norm": 0.2030881643295288, "kl": 0.45396813098341227, "learning_rate": 9.999994195658396e-06, "loss": -0.0208, "step": 260, "step_time": 4.20441282900174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016035353764891624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016035353764891624, "completions/clipped_ratio": 0.03125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 344.40625, "completions/mean_terminated_length": 341.5483703613281, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0333124995231628, "epoch": 0.00261, "frac_reward_zero_std": 0.0, "grad_norm": 0.5835732817649841, "kl": 0.6103949435055256, "learning_rate": 9.99999414371825e-06, "loss": -0.0309, "num_tokens": 5858359.0, "reward": 1.0202463865280151, "reward_std": 2.234804153442383, "rewards/rollout_reward_func/mean": 1.0202463865280151, "rewards/rollout_reward_func/std": 3.2133331298828125, "sampling/importance_sampling_ratio/max": 1.6625686883926392, "sampling/importance_sampling_ratio/mean": 0.7532814741134644, "sampling/importance_sampling_ratio/min": 0.008588896133005619, "sampling/sampling_logp_difference/max": 1.9650648832321167, "sampling/sampling_logp_difference/mean": 0.11663717031478882, "step": 261, "step_time": 8.600739835004788 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.0333676785230637, "epoch": 0.00262, "grad_norm": 0.3511929512023926, "kl": 0.6109492182731628, "learning_rate": 9.999994091546746e-06, "loss": -0.0344, "step": 262, "step_time": 5.016338672998245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 264.59375, "completions/mean_terminated_length": 264.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9870308637619019, "epoch": 0.00263, "frac_reward_zero_std": 0.25, "grad_norm": 0.13751406967639923, "kl": 0.8554112408310175, "learning_rate": 9.999994039143884e-06, "loss": -0.0215, "num_tokens": 5903830.0, "reward": 1.7483291625976562, "reward_std": 1.3167343139648438, "rewards/rollout_reward_func/mean": 1.7483291625976562, "rewards/rollout_reward_func/std": 2.978395462036133, "sampling/importance_sampling_ratio/max": 0.949575662612915, "sampling/importance_sampling_ratio/mean": 0.7358896732330322, "sampling/importance_sampling_ratio/min": 0.017244644463062286, "sampling/sampling_logp_difference/max": 2.5228652954101562, "sampling/sampling_logp_difference/mean": 0.1259002983570099, "step": 263, "step_time": 9.165114806002748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.0008277595043182, "epoch": 0.00264, "grad_norm": 0.14379505813121796, "kl": 0.8976229894906282, "learning_rate": 9.999993986509661e-06, "loss": -0.0219, "step": 264, "step_time": 4.705621683997379 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.027383207343518734, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03519570687785745, "completions/clipped_ratio": 0.03125, "completions/max_length": 690.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 422.28125, "completions/mean_terminated_length": 413.6451416015625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8573571294546127, "epoch": 0.00265, "frac_reward_zero_std": 0.0, "grad_norm": 0.4734410047531128, "kl": 1.0021406337618828, "learning_rate": 9.99999393364408e-06, "loss": -0.0228, "num_tokens": 5953359.0, "reward": 0.1295800805091858, "reward_std": 1.109178900718689, "rewards/rollout_reward_func/mean": 0.1295800805091858, "rewards/rollout_reward_func/std": 2.598655939102173, "sampling/importance_sampling_ratio/max": 1.1636399030685425, "sampling/importance_sampling_ratio/mean": 0.44592708349227905, "sampling/importance_sampling_ratio/min": 2.0785929514691225e-19, "sampling/sampling_logp_difference/max": 17.872182846069336, "sampling/sampling_logp_difference/mean": 0.47747522592544556, "step": 265, "step_time": 8.839034740996794 }, { "clip_ratio/high_max": 0.04829545505344868, "clip_ratio/high_mean": 0.02414772752672434, "clip_ratio/low_mean": 0.03420138917863369, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05834911670535803, "entropy": 1.8225839734077454, "epoch": 0.00266, "grad_norm": 0.34745097160339355, "kl": 1.0433971285820007, "learning_rate": 9.99999388054714e-06, "loss": -0.0234, "step": 266, "step_time": 5.155485016999592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 339.65625, "completions/mean_terminated_length": 339.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2140556648373604, "epoch": 0.00267, "frac_reward_zero_std": 0.0, "grad_norm": 0.34068918228149414, "kl": 1.6153830215334892, "learning_rate": 9.999993827218844e-06, "loss": -0.019, "num_tokens": 6001426.0, "reward": 1.7001194953918457, "reward_std": 2.4564149379730225, "rewards/rollout_reward_func/mean": 1.7001194953918457, "rewards/rollout_reward_func/std": 3.0035080909729004, "sampling/importance_sampling_ratio/max": 0.9476580619812012, "sampling/importance_sampling_ratio/mean": 0.6919848918914795, "sampling/importance_sampling_ratio/min": 0.04233410209417343, "sampling/sampling_logp_difference/max": 1.9820016622543335, "sampling/sampling_logp_difference/mean": 0.15136206150054932, "step": 267, "step_time": 8.654840761999367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2215594351291656, "epoch": 0.00268, "grad_norm": 0.29086604714393616, "kl": 1.521490816026926, "learning_rate": 9.999993773659186e-06, "loss": -0.0199, "step": 268, "step_time": 5.1179811880010675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 609.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 301.75, "completions/mean_terminated_length": 301.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4383197128772736, "epoch": 0.00269, "frac_reward_zero_std": 0.25, "grad_norm": 0.282607764005661, "kl": 0.6833417741581798, "learning_rate": 9.99999371986817e-06, "loss": -0.0282, "num_tokens": 6046241.0, "reward": 1.8613629341125488, "reward_std": 2.5841994285583496, "rewards/rollout_reward_func/mean": 1.8613629341125488, "rewards/rollout_reward_func/std": 3.159687042236328, "sampling/importance_sampling_ratio/max": 0.943653404712677, "sampling/importance_sampling_ratio/mean": 0.6239719390869141, "sampling/importance_sampling_ratio/min": 0.013730796054005623, "sampling/sampling_logp_difference/max": 3.1467316150665283, "sampling/sampling_logp_difference/mean": 0.18483370542526245, "step": 269, "step_time": 8.284967962001247 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.01785714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03348214365541935, "entropy": 1.4366602376103401, "epoch": 0.0027, "grad_norm": 0.2546578645706177, "kl": 0.6184361074119806, "learning_rate": 9.999993665845795e-06, "loss": -0.0282, "step": 270, "step_time": 4.453523497000788 }, { "clip_ratio/high_max": 0.06666666828095913, "clip_ratio/high_mean": 0.033333334140479565, "clip_ratio/low_mean": 0.017658730503171682, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05099206417798996, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 326.25, "completions/mean_terminated_length": 326.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7063374519348145, "epoch": 0.00271, "frac_reward_zero_std": 0.25, "grad_norm": 0.26623648405075073, "kl": 1.168674549087882, "learning_rate": 9.99999361159206e-06, "loss": -0.0181, "num_tokens": 6092899.0, "reward": 2.206629514694214, "reward_std": 2.454211950302124, "rewards/rollout_reward_func/mean": 2.206629514694214, "rewards/rollout_reward_func/std": 2.9601597785949707, "sampling/importance_sampling_ratio/max": 0.943114697933197, "sampling/importance_sampling_ratio/mean": 0.5480748414993286, "sampling/importance_sampling_ratio/min": 0.010389690287411213, "sampling/sampling_logp_difference/max": 3.259459972381592, "sampling/sampling_logp_difference/mean": 0.23253273963928223, "step": 271, "step_time": 8.489878422999027 }, { "clip_ratio/high_max": 0.037500000558793545, "clip_ratio/high_mean": 0.018750000279396772, "clip_ratio/low_mean": 0.015873016323894262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.034623016603291035, "entropy": 1.706713743507862, "epoch": 0.00272, "grad_norm": 0.2723273038864136, "kl": 0.9772176686674356, "learning_rate": 9.999993557106966e-06, "loss": -0.0187, "step": 272, "step_time": 5.025800926001466 }, { "clip_ratio/high_max": 0.040277778171002865, "clip_ratio/high_mean": 0.020138889085501432, "clip_ratio/low_mean": 0.020138889085501432, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.040277778171002865, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 445.65625, "completions/mean_terminated_length": 445.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.743712604045868, "epoch": 0.00273, "frac_reward_zero_std": 0.0, "grad_norm": 0.8967099189758301, "kl": 0.7792079821228981, "learning_rate": 9.999993502390513e-06, "loss": -0.019, "num_tokens": 6146331.0, "reward": 2.981555938720703, "reward_std": 2.384906053543091, "rewards/rollout_reward_func/mean": 2.981555938720703, "rewards/rollout_reward_func/std": 2.577519655227661, "sampling/importance_sampling_ratio/max": 1.0936769247055054, "sampling/importance_sampling_ratio/mean": 0.5653995871543884, "sampling/importance_sampling_ratio/min": 0.01296173594892025, "sampling/sampling_logp_difference/max": 2.2478551864624023, "sampling/sampling_logp_difference/mean": 0.23288431763648987, "step": 273, "step_time": 8.722157500998946 }, { "clip_ratio/high_max": 0.07916666753590107, "clip_ratio/high_mean": 0.052083334885537624, "clip_ratio/low_mean": 0.03333333367481828, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0854166685603559, "entropy": 1.6415338963270187, "epoch": 0.00274, "grad_norm": 0.6186096668243408, "kl": 0.703574638813734, "learning_rate": 9.999993447442701e-06, "loss": -0.0211, "step": 274, "step_time": 4.721231017001628 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 578.0, "completions/max_terminated_length": 578.0, "completions/mean_length": 324.25, "completions/mean_terminated_length": 324.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1202365159988403, "epoch": 0.00275, "frac_reward_zero_std": 0.0, "grad_norm": 0.4132913649082184, "kl": 0.7133209994062781, "learning_rate": 9.999993392263533e-06, "loss": -0.0082, "num_tokens": 6194110.0, "reward": 2.012208938598633, "reward_std": 2.9668045043945312, "rewards/rollout_reward_func/mean": 2.012208938598633, "rewards/rollout_reward_func/std": 3.2477328777313232, "sampling/importance_sampling_ratio/max": 0.9480411410331726, "sampling/importance_sampling_ratio/mean": 0.6694742441177368, "sampling/importance_sampling_ratio/min": 0.10730009526014328, "sampling/sampling_logp_difference/max": 2.103628635406494, "sampling/sampling_logp_difference/mean": 0.11591170728206635, "step": 275, "step_time": 8.708906328998637 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.0936727598309517, "epoch": 0.00276, "grad_norm": 0.4280955493450165, "kl": 0.7038591541349888, "learning_rate": 9.999993336853004e-06, "loss": -0.0096, "step": 276, "step_time": 4.923111268997673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 658.0, "completions/max_terminated_length": 658.0, "completions/mean_length": 295.3125, "completions/mean_terminated_length": 295.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.269017070531845, "epoch": 0.00277, "frac_reward_zero_std": 0.0, "grad_norm": 0.3465777635574341, "kl": 0.5512423366308212, "learning_rate": 9.999993281211115e-06, "loss": -0.0386, "num_tokens": 6241726.0, "reward": 2.561351776123047, "reward_std": 2.8475379943847656, "rewards/rollout_reward_func/mean": 2.561351776123047, "rewards/rollout_reward_func/std": 2.9850032329559326, "sampling/importance_sampling_ratio/max": 0.9370158314704895, "sampling/importance_sampling_ratio/mean": 0.6872305870056152, "sampling/importance_sampling_ratio/min": 6.371164280745746e-18, "sampling/sampling_logp_difference/max": 13.449613571166992, "sampling/sampling_logp_difference/mean": 0.35379520058631897, "step": 277, "step_time": 8.498274056999435 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.02130681835114956, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04214015230536461, "entropy": 1.2586323991417885, "epoch": 0.00278, "grad_norm": 0.2981734573841095, "kl": 0.5661062421277165, "learning_rate": 9.999993225337867e-06, "loss": -0.0393, "step": 278, "step_time": 4.568460558997685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 476.40625, "completions/mean_terminated_length": 476.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2805391550064087, "epoch": 0.00279, "frac_reward_zero_std": 0.0, "grad_norm": 0.2731775641441345, "kl": 0.5625610928982496, "learning_rate": 9.999993169233261e-06, "loss": -0.0384, "num_tokens": 6295653.0, "reward": 2.487842559814453, "reward_std": 2.571054458618164, "rewards/rollout_reward_func/mean": 2.487842559814453, "rewards/rollout_reward_func/std": 2.906182289123535, "sampling/importance_sampling_ratio/max": 0.9788100123405457, "sampling/importance_sampling_ratio/mean": 0.6318943500518799, "sampling/importance_sampling_ratio/min": 0.05079323425889015, "sampling/sampling_logp_difference/max": 2.663686513900757, "sampling/sampling_logp_difference/mean": 0.1377306580543518, "step": 279, "step_time": 8.992766154000492 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 1.257255919277668, "epoch": 0.0028, "grad_norm": 0.26493239402770996, "kl": 0.5640338426455855, "learning_rate": 9.999993112897298e-06, "loss": -0.0385, "step": 280, "step_time": 5.118240418998539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 406.59375, "completions/mean_terminated_length": 406.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2438434585928917, "epoch": 0.00281, "frac_reward_zero_std": 0.0, "grad_norm": 0.5090510249137878, "kl": 0.630475003272295, "learning_rate": 9.999993056329973e-06, "loss": -0.0164, "num_tokens": 6347512.0, "reward": 1.407324194908142, "reward_std": 3.029348373413086, "rewards/rollout_reward_func/mean": 1.407324194908142, "rewards/rollout_reward_func/std": 2.9507081508636475, "sampling/importance_sampling_ratio/max": 0.9518560171127319, "sampling/importance_sampling_ratio/mean": 0.6885493397712708, "sampling/importance_sampling_ratio/min": 0.1303435117006302, "sampling/sampling_logp_difference/max": 1.271589994430542, "sampling/sampling_logp_difference/mean": 0.11511921882629395, "step": 281, "step_time": 8.834557136999138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.2538617625832558, "epoch": 0.00282, "grad_norm": 0.35590478777885437, "kl": 0.6415127087384462, "learning_rate": 9.999992999531291e-06, "loss": -0.0169, "step": 282, "step_time": 4.84116095400168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.03125, "completions/max_length": 554.0, "completions/max_terminated_length": 554.0, "completions/mean_length": 148.4375, "completions/mean_terminated_length": 152.7096710205078, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8982963003218174, "epoch": 0.00283, "frac_reward_zero_std": 0.0, "grad_norm": 0.23308521509170532, "kl": 0.31338395085185766, "learning_rate": 9.99999294250125e-06, "loss": -0.0331, "num_tokens": 6387792.0, "reward": 1.6175565719604492, "reward_std": 2.3343920707702637, "rewards/rollout_reward_func/mean": 1.6175565719604492, "rewards/rollout_reward_func/std": 3.1909255981445312, "sampling/importance_sampling_ratio/max": 0.9518018960952759, "sampling/importance_sampling_ratio/mean": 0.7945637106895447, "sampling/importance_sampling_ratio/min": 0.007692177314311266, "sampling/sampling_logp_difference/max": 1.6230413913726807, "sampling/sampling_logp_difference/mean": 0.10479216277599335, "step": 283, "step_time": 7.900667785001133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.881880410015583, "epoch": 0.00284, "grad_norm": 0.22240985929965973, "kl": 0.308293673209846, "learning_rate": 9.99999288523985e-06, "loss": -0.0332, "step": 284, "step_time": 5.132565325999167 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 188.0625, "completions/mean_terminated_length": 188.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9174739196896553, "epoch": 0.00285, "frac_reward_zero_std": 0.5, "grad_norm": 0.3105739951133728, "kl": 0.6092890836298466, "learning_rate": 9.99999282774709e-06, "loss": -0.0282, "num_tokens": 6428540.0, "reward": 3.0046041011810303, "reward_std": 1.6731159687042236, "rewards/rollout_reward_func/mean": 3.0046041011810303, "rewards/rollout_reward_func/std": 2.5362093448638916, "sampling/importance_sampling_ratio/max": 1.126554250717163, "sampling/importance_sampling_ratio/mean": 0.8041306734085083, "sampling/importance_sampling_ratio/min": 0.04273172467947006, "sampling/sampling_logp_difference/max": 2.6994082927703857, "sampling/sampling_logp_difference/mean": 0.10078239440917969, "step": 285, "step_time": 8.109903709000719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 0.9324377104640007, "epoch": 0.00286, "grad_norm": 0.20476345717906952, "kl": 0.650788351893425, "learning_rate": 9.999992770022972e-06, "loss": -0.0283, "step": 286, "step_time": 4.631748609001079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 564.0, "completions/max_terminated_length": 564.0, "completions/mean_length": 238.25, "completions/mean_terminated_length": 238.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.283954180777073, "epoch": 0.00287, "frac_reward_zero_std": 0.0, "grad_norm": 0.3409591615200043, "kl": 0.6840553786605597, "learning_rate": 9.999992712067494e-06, "loss": -0.0211, "num_tokens": 6473530.0, "reward": 0.7038171291351318, "reward_std": 2.589984655380249, "rewards/rollout_reward_func/mean": 0.7038171291351318, "rewards/rollout_reward_func/std": 3.312828779220581, "sampling/importance_sampling_ratio/max": 0.9591862559318542, "sampling/importance_sampling_ratio/mean": 0.6574411392211914, "sampling/importance_sampling_ratio/min": 4.013203226602085e-18, "sampling/sampling_logp_difference/max": 14.284692764282227, "sampling/sampling_logp_difference/mean": 0.3218368887901306, "step": 287, "step_time": 8.402859163999892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2769583985209465, "epoch": 0.00288, "grad_norm": 0.3245112895965576, "kl": 0.674145869910717, "learning_rate": 9.999992653880659e-06, "loss": -0.0212, "step": 288, "step_time": 4.721520084996882 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 299.59375, "completions/mean_terminated_length": 299.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2095577716827393, "epoch": 0.00289, "frac_reward_zero_std": 0.25, "grad_norm": 0.20679379999637604, "kl": 0.7103812508285046, "learning_rate": 9.999992595462465e-06, "loss": -0.0276, "num_tokens": 6520814.0, "reward": -0.4763104021549225, "reward_std": 1.9860165119171143, "rewards/rollout_reward_func/mean": -0.4763104021549225, "rewards/rollout_reward_func/std": 3.1034886837005615, "sampling/importance_sampling_ratio/max": 1.2486522197723389, "sampling/importance_sampling_ratio/mean": 0.7088731527328491, "sampling/importance_sampling_ratio/min": 0.08989165723323822, "sampling/sampling_logp_difference/max": 2.2850112915039062, "sampling/sampling_logp_difference/mean": 0.1448763906955719, "step": 289, "step_time": 8.932830636002109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.230702318251133, "epoch": 0.0029, "grad_norm": 0.23287411034107208, "kl": 0.7173266708850861, "learning_rate": 9.999992536812911e-06, "loss": -0.0279, "step": 290, "step_time": 4.65920409499995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 434.65625, "completions/mean_terminated_length": 434.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.422908529639244, "epoch": 0.00291, "frac_reward_zero_std": 0.0, "grad_norm": 0.34177932143211365, "kl": 1.3301945850253105, "learning_rate": 9.999992477931997e-06, "loss": -0.0545, "num_tokens": 6574567.0, "reward": 2.4166245460510254, "reward_std": 2.774304151535034, "rewards/rollout_reward_func/mean": 2.4166245460510254, "rewards/rollout_reward_func/std": 3.036388874053955, "sampling/importance_sampling_ratio/max": 0.938244640827179, "sampling/importance_sampling_ratio/mean": 0.6133846044540405, "sampling/importance_sampling_ratio/min": 0.021478045731782913, "sampling/sampling_logp_difference/max": 3.835418224334717, "sampling/sampling_logp_difference/mean": 0.1695854365825653, "step": 291, "step_time": 8.755824967998706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.4360708594322205, "epoch": 0.00292, "grad_norm": 0.29311031103134155, "kl": 1.2234554439783096, "learning_rate": 9.999992418819726e-06, "loss": -0.0554, "step": 292, "step_time": 5.120455827000114 }, { "clip_ratio/high_max": 0.03348214365541935, "clip_ratio/high_mean": 0.016741071827709675, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03149801632389426, "completions/clipped_ratio": 0.03125, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 299.0625, "completions/mean_terminated_length": 292.8709716796875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2132456824183464, "epoch": 0.00293, "frac_reward_zero_std": 0.0, "grad_norm": 0.3056657016277313, "kl": 1.2852410282939672, "learning_rate": 9.999992359476096e-06, "loss": -0.0365, "num_tokens": 6622431.0, "reward": 0.06782114505767822, "reward_std": 3.1312201023101807, "rewards/rollout_reward_func/mean": 0.06782114505767822, "rewards/rollout_reward_func/std": 3.2783267498016357, "sampling/importance_sampling_ratio/max": 0.9519522190093994, "sampling/importance_sampling_ratio/mean": 0.6665964126586914, "sampling/importance_sampling_ratio/min": 0.065802201628685, "sampling/sampling_logp_difference/max": 2.345064640045166, "sampling/sampling_logp_difference/mean": 0.15461769700050354, "step": 293, "step_time": 9.233303066001099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020438762847334146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020438762847334146, "entropy": 1.2242946550250053, "epoch": 0.00294, "grad_norm": 0.32024022936820984, "kl": 1.3372684195637703, "learning_rate": 9.999992299901106e-06, "loss": -0.0365, "step": 294, "step_time": 4.703244152997286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 114.84375, "completions/mean_terminated_length": 114.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8073603995144367, "epoch": 0.00295, "frac_reward_zero_std": 0.5, "grad_norm": 0.3634255826473236, "kl": 0.53195755276829, "learning_rate": 9.999992240094759e-06, "loss": -0.0035, "num_tokens": 6659071.0, "reward": 2.1976139545440674, "reward_std": 0.6806360483169556, "rewards/rollout_reward_func/mean": 2.1976139545440674, "rewards/rollout_reward_func/std": 3.2114827632904053, "sampling/importance_sampling_ratio/max": 0.987760603427887, "sampling/importance_sampling_ratio/mean": 0.8440335988998413, "sampling/importance_sampling_ratio/min": 0.07392389327287674, "sampling/sampling_logp_difference/max": 1.6726475954055786, "sampling/sampling_logp_difference/mean": 0.08373713493347168, "step": 295, "step_time": 7.9659479609999835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7986013628542423, "epoch": 0.00296, "grad_norm": 0.4718514084815979, "kl": 0.5225900402292609, "learning_rate": 9.99999218005705e-06, "loss": -0.0039, "step": 296, "step_time": 4.958723788004136 }, { "clip_ratio/high_max": 0.040277778171002865, "clip_ratio/high_mean": 0.020138889085501432, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020138889085501432, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 175.28125, "completions/mean_terminated_length": 175.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.125465951859951, "epoch": 0.00297, "frac_reward_zero_std": 0.0, "grad_norm": 0.5667382478713989, "kl": 0.5224557435140014, "learning_rate": 9.999992119787986e-06, "loss": -0.0345, "num_tokens": 6699903.0, "reward": -0.3050542175769806, "reward_std": 2.0547308921813965, "rewards/rollout_reward_func/mean": -0.3050542175769806, "rewards/rollout_reward_func/std": 3.145312547683716, "sampling/importance_sampling_ratio/max": 1.4604625701904297, "sampling/importance_sampling_ratio/mean": 0.7498864531517029, "sampling/importance_sampling_ratio/min": 0.06686928868293762, "sampling/sampling_logp_difference/max": 2.4729974269866943, "sampling/sampling_logp_difference/mean": 0.15167781710624695, "step": 297, "step_time": 8.87612645900117 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.02083333395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027777778450399637, "entropy": 1.108258344233036, "epoch": 0.00298, "grad_norm": 0.191434845328331, "kl": 0.6396890887990594, "learning_rate": 9.999992059287562e-06, "loss": -0.0364, "step": 298, "step_time": 4.687386869996772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 434.78125, "completions/mean_terminated_length": 434.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2563556507229805, "epoch": 0.00299, "frac_reward_zero_std": 0.0, "grad_norm": 0.7138544321060181, "kl": 1.523621667176485, "learning_rate": 9.999991998555777e-06, "loss": -0.0605, "num_tokens": 6752361.0, "reward": 1.8076213598251343, "reward_std": 2.5622105598449707, "rewards/rollout_reward_func/mean": 1.8076213598251343, "rewards/rollout_reward_func/std": 2.9938526153564453, "sampling/importance_sampling_ratio/max": 1.4809743165969849, "sampling/importance_sampling_ratio/mean": 0.6713437438011169, "sampling/importance_sampling_ratio/min": 0.035837024450302124, "sampling/sampling_logp_difference/max": 2.015321731567383, "sampling/sampling_logp_difference/mean": 0.15239575505256653, "step": 299, "step_time": 8.427319927995995 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.018750000279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125000046566129, "entropy": 1.2527592852711678, "epoch": 0.003, "grad_norm": 0.3882288634777069, "kl": 1.5343394242227077, "learning_rate": 9.999991937592636e-06, "loss": -0.0612, "step": 300, "step_time": 4.61445397100033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 129.5625, "completions/mean_terminated_length": 133.22579956054688, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7377376966178417, "epoch": 0.00301, "frac_reward_zero_std": 0.5, "grad_norm": 0.1814500093460083, "kl": 0.2595238443464041, "learning_rate": 9.999991876398134e-06, "loss": -0.0073, "num_tokens": 6789295.0, "reward": 3.5824995040893555, "reward_std": 0.818490743637085, "rewards/rollout_reward_func/mean": 3.5824995040893555, "rewards/rollout_reward_func/std": 1.184607744216919, "sampling/importance_sampling_ratio/max": 0.9562745690345764, "sampling/importance_sampling_ratio/mean": 0.8157243132591248, "sampling/importance_sampling_ratio/min": 0.0035754707641899586, "sampling/sampling_logp_difference/max": 2.0729238986968994, "sampling/sampling_logp_difference/mean": 0.09530050307512283, "step": 301, "step_time": 8.747147670999766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.72634581848979, "epoch": 0.00302, "grad_norm": 0.18956787884235382, "kl": 0.2619403158314526, "learning_rate": 9.999991814972274e-06, "loss": -0.0075, "step": 302, "step_time": 4.379188247999991 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 739.0, "completions/max_terminated_length": 739.0, "completions/mean_length": 308.875, "completions/mean_terminated_length": 308.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2605941444635391, "epoch": 0.00303, "frac_reward_zero_std": 0.0, "grad_norm": 0.27148571610450745, "kl": 1.2831708751618862, "learning_rate": 9.999991753315055e-06, "loss": -0.021, "num_tokens": 6837541.0, "reward": -0.5418508648872375, "reward_std": 2.1882576942443848, "rewards/rollout_reward_func/mean": -0.5418508648872375, "rewards/rollout_reward_func/std": 2.7370846271514893, "sampling/importance_sampling_ratio/max": 0.9549022912979126, "sampling/importance_sampling_ratio/mean": 0.6726999282836914, "sampling/importance_sampling_ratio/min": 0.0017292930278927088, "sampling/sampling_logp_difference/max": 2.2815518379211426, "sampling/sampling_logp_difference/mean": 0.16357730329036713, "step": 303, "step_time": 8.938086881002164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0243055559694767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0243055559694767, "entropy": 1.289270929992199, "epoch": 0.00304, "grad_norm": 0.29616039991378784, "kl": 1.2488783709704876, "learning_rate": 9.999991691426477e-06, "loss": -0.0209, "step": 304, "step_time": 4.7804413179965195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0034722222480922937, "completions/clipped_ratio": 0.0, "completions/max_length": 628.0, "completions/max_terminated_length": 628.0, "completions/mean_length": 425.0625, "completions/mean_terminated_length": 425.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1632800996303558, "epoch": 0.00305, "frac_reward_zero_std": 0.0, "grad_norm": 0.27577245235443115, "kl": 1.0137687772512436, "learning_rate": 9.99999162930654e-06, "loss": -0.0341, "num_tokens": 6890497.0, "reward": 1.0022096633911133, "reward_std": 2.2561392784118652, "rewards/rollout_reward_func/mean": 1.0022096633911133, "rewards/rollout_reward_func/std": 3.4691808223724365, "sampling/importance_sampling_ratio/max": 1.0166676044464111, "sampling/importance_sampling_ratio/mean": 0.6474533081054688, "sampling/importance_sampling_ratio/min": 1.4797157368764274e-14, "sampling/sampling_logp_difference/max": 14.534453392028809, "sampling/sampling_logp_difference/mean": 0.28563255071640015, "step": 305, "step_time": 8.971302084002673 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.008680555736646056, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014930555829778314, "entropy": 1.164705142378807, "epoch": 0.00306, "grad_norm": 0.2760099768638611, "kl": 1.0242262370884418, "learning_rate": 9.999991566955245e-06, "loss": -0.0344, "step": 306, "step_time": 5.1122860409977875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 350.4375, "completions/mean_terminated_length": 350.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0010123401880264, "epoch": 0.00307, "frac_reward_zero_std": 0.25, "grad_norm": 0.2866983115673065, "kl": 1.4937348738312721, "learning_rate": 9.99999150437259e-06, "loss": -0.007, "num_tokens": 6939779.0, "reward": 0.5330899953842163, "reward_std": 2.141327381134033, "rewards/rollout_reward_func/mean": 0.5330899953842163, "rewards/rollout_reward_func/std": 3.344370126724243, "sampling/importance_sampling_ratio/max": 0.9448097348213196, "sampling/importance_sampling_ratio/mean": 0.7070366144180298, "sampling/importance_sampling_ratio/min": 0.08833281695842743, "sampling/sampling_logp_difference/max": 2.2239770889282227, "sampling/sampling_logp_difference/mean": 0.1097252368927002, "step": 307, "step_time": 8.77261546200134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.0067647770047188, "epoch": 0.00308, "grad_norm": 0.28827545046806335, "kl": 1.4904240295290947, "learning_rate": 9.999991441558578e-06, "loss": -0.0077, "step": 308, "step_time": 4.726597969998693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015230429824441671, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.015230429824441671, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 496.65625, "completions/mean_terminated_length": 496.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3137786835432053, "epoch": 0.00309, "frac_reward_zero_std": 0.0, "grad_norm": 0.15998758375644684, "kl": 1.1303712874650955, "learning_rate": 9.999991378513206e-06, "loss": -0.0377, "num_tokens": 6994995.0, "reward": 0.4924342632293701, "reward_std": 1.3365092277526855, "rewards/rollout_reward_func/mean": 0.4924342632293701, "rewards/rollout_reward_func/std": 3.0396032333374023, "sampling/importance_sampling_ratio/max": 0.9047294855117798, "sampling/importance_sampling_ratio/mean": 0.604641854763031, "sampling/importance_sampling_ratio/min": 3.9186891396617017e-22, "sampling/sampling_logp_difference/max": 18.546510696411133, "sampling/sampling_logp_difference/mean": 0.4132049083709717, "step": 309, "step_time": 9.499631661999956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017834596801549196, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.017834596801549196, "entropy": 1.3157368302345276, "epoch": 0.0031, "grad_norm": 0.16967971622943878, "kl": 1.1454923674464226, "learning_rate": 9.999991315236475e-06, "loss": -0.0375, "step": 310, "step_time": 5.266234352000538 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 368.6875, "completions/mean_terminated_length": 368.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0832163766026497, "epoch": 0.00311, "frac_reward_zero_std": 0.25, "grad_norm": 0.4350118637084961, "kl": 0.8704713974148035, "learning_rate": 9.999991251728386e-06, "loss": -0.0323, "num_tokens": 7044753.0, "reward": 2.5178463459014893, "reward_std": 2.1322760581970215, "rewards/rollout_reward_func/mean": 2.5178463459014893, "rewards/rollout_reward_func/std": 2.5156900882720947, "sampling/importance_sampling_ratio/max": 0.9560317397117615, "sampling/importance_sampling_ratio/mean": 0.69179368019104, "sampling/importance_sampling_ratio/min": 0.0383094847202301, "sampling/sampling_logp_difference/max": 2.7407920360565186, "sampling/sampling_logp_difference/mean": 0.1286538988351822, "step": 311, "step_time": 8.807167393000782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "entropy": 1.1142248138785362, "epoch": 0.00312, "grad_norm": 0.26570335030555725, "kl": 0.8434884808957577, "learning_rate": 9.999991187988938e-06, "loss": -0.0334, "step": 312, "step_time": 4.731497896002111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 328.9375, "completions/mean_terminated_length": 328.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0695917159318924, "epoch": 0.00313, "frac_reward_zero_std": 0.0, "grad_norm": 0.3997785449028015, "kl": 1.589432917535305, "learning_rate": 9.999991124018132e-06, "loss": -0.0275, "num_tokens": 7092828.0, "reward": 0.7597368359565735, "reward_std": 1.6265523433685303, "rewards/rollout_reward_func/mean": 0.7597368359565735, "rewards/rollout_reward_func/std": 3.5006399154663086, "sampling/importance_sampling_ratio/max": 1.2475653886795044, "sampling/importance_sampling_ratio/mean": 0.7292286157608032, "sampling/importance_sampling_ratio/min": 0.052878912538290024, "sampling/sampling_logp_difference/max": 1.3963781595230103, "sampling/sampling_logp_difference/mean": 0.1223495751619339, "step": 313, "step_time": 9.340078863000599 }, { "clip_ratio/high_max": 0.025252525694668293, "clip_ratio/high_mean": 0.012626262847334146, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02512626303359866, "entropy": 1.081346333026886, "epoch": 0.00314, "grad_norm": 0.26545223593711853, "kl": 1.5609169341623783, "learning_rate": 9.999991059815965e-06, "loss": -0.029, "step": 314, "step_time": 5.183379144000355 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020833333488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 435.09375, "completions/mean_terminated_length": 435.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4997138753533363, "epoch": 0.00315, "frac_reward_zero_std": 0.0, "grad_norm": 0.47175511717796326, "kl": 1.5258920937776566, "learning_rate": 9.99999099538244e-06, "loss": -0.0625, "num_tokens": 7145632.0, "reward": 2.1584973335266113, "reward_std": 3.105316162109375, "rewards/rollout_reward_func/mean": 2.1584973335266113, "rewards/rollout_reward_func/std": 3.2799761295318604, "sampling/importance_sampling_ratio/max": 0.9035847783088684, "sampling/importance_sampling_ratio/mean": 0.5573515892028809, "sampling/importance_sampling_ratio/min": 2.820124125543777e-12, "sampling/sampling_logp_difference/max": 13.163662910461426, "sampling/sampling_logp_difference/mean": 0.3000519275665283, "step": 315, "step_time": 8.977940113998557 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.032404891680926085, "clip_ratio/low_min": 0.005434782709926367, "clip_ratio/region_mean": 0.040217391680926085, "entropy": 1.535951890051365, "epoch": 0.00316, "grad_norm": 0.2271748036146164, "kl": 1.6846736446022987, "learning_rate": 9.999990930717558e-06, "loss": -0.0624, "step": 316, "step_time": 4.776798355998835 }, { "clip_ratio/high_max": 0.02638888917863369, "clip_ratio/high_mean": 0.013194444589316845, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022123016882687807, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 352.40625, "completions/mean_terminated_length": 352.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1450757756829262, "epoch": 0.00317, "frac_reward_zero_std": 0.25, "grad_norm": 0.23714756965637207, "kl": 1.4189636707305908, "learning_rate": 9.999990865821316e-06, "loss": -0.029, "num_tokens": 7194433.0, "reward": 2.403735876083374, "reward_std": 1.7558982372283936, "rewards/rollout_reward_func/mean": 2.403735876083374, "rewards/rollout_reward_func/std": 2.5461816787719727, "sampling/importance_sampling_ratio/max": 1.0542691946029663, "sampling/importance_sampling_ratio/mean": 0.674139142036438, "sampling/importance_sampling_ratio/min": 0.01513923704624176, "sampling/sampling_logp_difference/max": 3.9987633228302, "sampling/sampling_logp_difference/mean": 0.14549928903579712, "step": 317, "step_time": 9.000739829994927 }, { "clip_ratio/high_max": 0.02638888917863369, "clip_ratio/high_mean": 0.013194444589316845, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "entropy": 1.1605610623955727, "epoch": 0.00318, "grad_norm": 0.25970175862312317, "kl": 1.3669251203536987, "learning_rate": 9.999990800693715e-06, "loss": -0.0293, "step": 318, "step_time": 5.197681843001192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02638888917863369, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02638888917863369, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 276.84375, "completions/mean_terminated_length": 276.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4664596319198608, "epoch": 0.00319, "frac_reward_zero_std": 0.25, "grad_norm": 0.7450628876686096, "kl": 2.775124542415142, "learning_rate": 9.999990735334755e-06, "loss": -0.0293, "num_tokens": 7239934.0, "reward": 1.310276985168457, "reward_std": 2.2127788066864014, "rewards/rollout_reward_func/mean": 1.310276985168457, "rewards/rollout_reward_func/std": 2.9030959606170654, "sampling/importance_sampling_ratio/max": 1.7016165256500244, "sampling/importance_sampling_ratio/mean": 0.613946795463562, "sampling/importance_sampling_ratio/min": 0.015952659770846367, "sampling/sampling_logp_difference/max": 3.7042934894561768, "sampling/sampling_logp_difference/mean": 0.21600094437599182, "step": 319, "step_time": 9.679862493996552 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.019444444682449102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03333333367481828, "entropy": 1.4751283451914787, "epoch": 0.0032, "grad_norm": 0.2960353493690491, "kl": 2.3329054936766624, "learning_rate": 9.999990669744437e-06, "loss": -0.032, "step": 320, "step_time": 4.855518603995733 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 348.3125, "completions/mean_terminated_length": 348.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0822884142398834, "epoch": 0.00321, "frac_reward_zero_std": 0.0, "grad_norm": 0.6044095754623413, "kl": 1.1581818647682667, "learning_rate": 9.999990603922758e-06, "loss": 0.0125, "num_tokens": 7289191.0, "reward": 1.5331259965896606, "reward_std": 2.8640379905700684, "rewards/rollout_reward_func/mean": 1.5331259965896606, "rewards/rollout_reward_func/std": 3.3952603340148926, "sampling/importance_sampling_ratio/max": 0.9666628837585449, "sampling/importance_sampling_ratio/mean": 0.715779185295105, "sampling/importance_sampling_ratio/min": 0.08979030698537827, "sampling/sampling_logp_difference/max": 1.887528896331787, "sampling/sampling_logp_difference/mean": 0.1109558716416359, "step": 321, "step_time": 9.285168304999388 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.016666667070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0285984855145216, "entropy": 1.0630452632904053, "epoch": 0.00322, "grad_norm": 0.2595117688179016, "kl": 1.130724586546421, "learning_rate": 9.999990537869723e-06, "loss": 0.0114, "step": 322, "step_time": 5.385023473001638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 224.34375, "completions/mean_terminated_length": 224.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3478436022996902, "epoch": 0.00323, "frac_reward_zero_std": 0.25, "grad_norm": 0.31599918007850647, "kl": 1.51488646119833, "learning_rate": 9.999990471585328e-06, "loss": -0.0221, "num_tokens": 7332189.0, "reward": 1.6236566305160522, "reward_std": 1.7300677299499512, "rewards/rollout_reward_func/mean": 1.6236566305160522, "rewards/rollout_reward_func/std": 3.0101611614227295, "sampling/importance_sampling_ratio/max": 1.318284034729004, "sampling/importance_sampling_ratio/mean": 0.667516827583313, "sampling/importance_sampling_ratio/min": 0.007190053351223469, "sampling/sampling_logp_difference/max": 2.2528367042541504, "sampling/sampling_logp_difference/mean": 0.1958891600370407, "step": 323, "step_time": 9.397529666000992 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018229166977107525, "entropy": 1.3304093480110168, "epoch": 0.00324, "grad_norm": 0.1714465320110321, "kl": 1.3717603906989098, "learning_rate": 9.999990405069573e-06, "loss": -0.0224, "step": 324, "step_time": 4.801522709998608 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.033749999944120646, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04069444444030523, "completions/clipped_ratio": 0.03125, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 357.53125, "completions/mean_terminated_length": 350.9032287597656, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4599971249699593, "epoch": 0.00325, "frac_reward_zero_std": 0.0, "grad_norm": 3.9628076553344727, "kl": 11.049746975302696, "learning_rate": 9.999990338322462e-06, "loss": -0.0361, "num_tokens": 7382199.0, "reward": 1.1101906299591064, "reward_std": 3.030630350112915, "rewards/rollout_reward_func/mean": 1.1101906299591064, "rewards/rollout_reward_func/std": 3.0851893424987793, "sampling/importance_sampling_ratio/max": 0.9504741430282593, "sampling/importance_sampling_ratio/mean": 0.5427262783050537, "sampling/importance_sampling_ratio/min": 0.019177205860614777, "sampling/sampling_logp_difference/max": 2.505143880844116, "sampling/sampling_logp_difference/mean": 0.18973317742347717, "step": 325, "step_time": 9.03653855399898 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.01638888893648982, "clip_ratio/low_min": 0.004999999888241291, "clip_ratio/region_mean": 0.030277778394520283, "entropy": 1.4424382224678993, "epoch": 0.00326, "grad_norm": 0.5519586801528931, "kl": 2.0867833867669106, "learning_rate": 9.999990271343991e-06, "loss": -0.052, "step": 326, "step_time": 5.36251250299938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009785353671759367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009785353671759367, "completions/clipped_ratio": 0.03125, "completions/max_length": 593.0, "completions/max_terminated_length": 580.0, "completions/mean_length": 128.46875, "completions/mean_terminated_length": 113.4838638305664, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7953133136034012, "epoch": 0.00327, "frac_reward_zero_std": 0.25, "grad_norm": 0.3358995020389557, "kl": 0.9013867378234863, "learning_rate": 9.999990204134161e-06, "loss": -0.0061, "num_tokens": 7422234.0, "reward": 1.0667448043823242, "reward_std": 1.4552998542785645, "rewards/rollout_reward_func/mean": 1.0667448043823242, "rewards/rollout_reward_func/std": 3.3068857192993164, "sampling/importance_sampling_ratio/max": 0.9600101709365845, "sampling/importance_sampling_ratio/mean": 0.7684547901153564, "sampling/importance_sampling_ratio/min": 0.037790194153785706, "sampling/sampling_logp_difference/max": 1.8852750062942505, "sampling/sampling_logp_difference/mean": 0.09411997348070145, "step": 327, "step_time": 8.821932632004973 }, { "clip_ratio/high_max": 0.026988636702299118, "clip_ratio/high_mean": 0.013494318351149559, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019176136702299118, "entropy": 0.8072605617344379, "epoch": 0.00328, "grad_norm": 0.31088271737098694, "kl": 0.8958633840084076, "learning_rate": 9.999990136692973e-06, "loss": -0.0057, "step": 328, "step_time": 4.538892223996299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00941985659301281, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00941985659301281, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 146.90625, "completions/mean_terminated_length": 146.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.429712675511837, "epoch": 0.00329, "frac_reward_zero_std": 0.25, "grad_norm": 0.29741981625556946, "kl": 0.6284697549417615, "learning_rate": 9.999990069020426e-06, "loss": -0.0248, "num_tokens": 7462770.0, "reward": 1.1189593076705933, "reward_std": 2.149778366088867, "rewards/rollout_reward_func/mean": 1.1189593076705933, "rewards/rollout_reward_func/std": 3.1824352741241455, "sampling/importance_sampling_ratio/max": 0.9584074020385742, "sampling/importance_sampling_ratio/mean": 0.6754931211471558, "sampling/importance_sampling_ratio/min": 1.143703858885858e-18, "sampling/sampling_logp_difference/max": 13.64044189453125, "sampling/sampling_logp_difference/mean": 0.4281071126461029, "step": 329, "step_time": 8.512813580997317 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.02007326576858759, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027017710730433464, "entropy": 1.4395379684865475, "epoch": 0.0033, "grad_norm": 0.1904386729001999, "kl": 0.6252710497938097, "learning_rate": 9.99999000111652e-06, "loss": -0.026, "step": 330, "step_time": 5.112539192996337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 314.03125, "completions/mean_terminated_length": 314.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8704364821314812, "epoch": 0.00331, "frac_reward_zero_std": 0.25, "grad_norm": 0.19349834322929382, "kl": 0.5716027114540339, "learning_rate": 9.999989932981256e-06, "loss": -0.0464, "num_tokens": 7510437.0, "reward": 2.9201135635375977, "reward_std": 2.1698341369628906, "rewards/rollout_reward_func/mean": 2.9201135635375977, "rewards/rollout_reward_func/std": 2.6788415908813477, "sampling/importance_sampling_ratio/max": 0.9579988121986389, "sampling/importance_sampling_ratio/mean": 0.7627006769180298, "sampling/importance_sampling_ratio/min": 0.13992050290107727, "sampling/sampling_logp_difference/max": 1.373105764389038, "sampling/sampling_logp_difference/mean": 0.08131727576255798, "step": 331, "step_time": 9.707577700997717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8847976550459862, "epoch": 0.00332, "grad_norm": 0.1846991926431656, "kl": 0.5763975717127323, "learning_rate": 9.999989864614631e-06, "loss": -0.0467, "step": 332, "step_time": 4.917261233002137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 612.0, "completions/max_terminated_length": 612.0, "completions/mean_length": 392.5625, "completions/mean_terminated_length": 392.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2622144967317581, "epoch": 0.00333, "frac_reward_zero_std": 0.0, "grad_norm": 0.43359243869781494, "kl": 0.8749797884374857, "learning_rate": 9.99998979601665e-06, "loss": -0.0504, "num_tokens": 7562682.0, "reward": 1.7909235954284668, "reward_std": 3.1867592334747314, "rewards/rollout_reward_func/mean": 1.7909235954284668, "rewards/rollout_reward_func/std": 3.3396952152252197, "sampling/importance_sampling_ratio/max": 0.9615586996078491, "sampling/importance_sampling_ratio/mean": 0.6173836588859558, "sampling/importance_sampling_ratio/min": 0.017176300287246704, "sampling/sampling_logp_difference/max": 2.4193484783172607, "sampling/sampling_logp_difference/mean": 0.14959940314292908, "step": 333, "step_time": 8.760458889999427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018876262940466404, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018876262940466404, "entropy": 1.3224172368645668, "epoch": 0.00334, "grad_norm": 0.31497886776924133, "kl": 0.8944477625191212, "learning_rate": 9.99998972718731e-06, "loss": -0.0518, "step": 334, "step_time": 4.694594546999724 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 523.0, "completions/max_terminated_length": 523.0, "completions/mean_length": 192.4375, "completions/mean_terminated_length": 192.4375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0700028017163277, "epoch": 0.00335, "frac_reward_zero_std": 0.0, "grad_norm": 0.40497449040412903, "kl": 0.6460456512868404, "learning_rate": 9.99998965812661e-06, "loss": -0.0412, "num_tokens": 7606021.0, "reward": 1.3145744800567627, "reward_std": 2.2505240440368652, "rewards/rollout_reward_func/mean": 1.3145744800567627, "rewards/rollout_reward_func/std": 3.523637294769287, "sampling/importance_sampling_ratio/max": 0.9749676585197449, "sampling/importance_sampling_ratio/mean": 0.7361294031143188, "sampling/importance_sampling_ratio/min": 0.006743552628904581, "sampling/sampling_logp_difference/max": 2.4272561073303223, "sampling/sampling_logp_difference/mean": 0.12050535529851913, "step": 335, "step_time": 9.285989787998915 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.1178304702043533, "epoch": 0.00336, "grad_norm": 0.40962737798690796, "kl": 0.6296624038368464, "learning_rate": 9.999989588834551e-06, "loss": -0.0421, "step": 336, "step_time": 4.390089510996404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 595.0, "completions/max_terminated_length": 595.0, "completions/mean_length": 228.34375, "completions/mean_terminated_length": 228.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2292742803692818, "epoch": 0.00337, "frac_reward_zero_std": 0.5, "grad_norm": 0.18985813856124878, "kl": 1.0107364878058434, "learning_rate": 9.999989519311134e-06, "loss": -0.0104, "num_tokens": 7649830.0, "reward": 0.27222931385040283, "reward_std": 0.8957093954086304, "rewards/rollout_reward_func/mean": 0.27222931385040283, "rewards/rollout_reward_func/std": 3.2821221351623535, "sampling/importance_sampling_ratio/max": 0.9555379748344421, "sampling/importance_sampling_ratio/mean": 0.6962318420410156, "sampling/importance_sampling_ratio/min": 0.06199948862195015, "sampling/sampling_logp_difference/max": 2.664698839187622, "sampling/sampling_logp_difference/mean": 0.14021211862564087, "step": 337, "step_time": 8.727195794999716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2939593344926834, "epoch": 0.00338, "grad_norm": 0.2789892852306366, "kl": 1.1181061044335365, "learning_rate": 9.99998944955636e-06, "loss": -0.01, "step": 338, "step_time": 4.659629089999726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 291.875, "completions/mean_terminated_length": 291.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.815903678536415, "epoch": 0.00339, "frac_reward_zero_std": 0.0, "grad_norm": 0.3311612904071808, "kl": 0.8965452266857028, "learning_rate": 9.999989379570225e-06, "loss": -0.0053, "num_tokens": 7696706.0, "reward": -0.010229945182800293, "reward_std": 2.281690835952759, "rewards/rollout_reward_func/mean": -0.010229945182800293, "rewards/rollout_reward_func/std": 3.1100056171417236, "sampling/importance_sampling_ratio/max": 0.9477086663246155, "sampling/importance_sampling_ratio/mean": 0.5322908163070679, "sampling/importance_sampling_ratio/min": 0.10391397774219513, "sampling/sampling_logp_difference/max": 1.9370012283325195, "sampling/sampling_logp_difference/mean": 0.22836047410964966, "step": 339, "step_time": 9.773741537002934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01406249962747097, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.01406249962747097, "entropy": 1.824380874633789, "epoch": 0.0034, "grad_norm": 0.34231898188591003, "kl": 0.9129719231277704, "learning_rate": 9.999989309352732e-06, "loss": -0.0058, "step": 340, "step_time": 4.805583634994036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 355.03125, "completions/mean_terminated_length": 349.0322570800781, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1396376863121986, "epoch": 0.00341, "frac_reward_zero_std": 0.25, "grad_norm": 0.20432353019714355, "kl": 0.43887635599821806, "learning_rate": 9.99998923890388e-06, "loss": -0.0163, "num_tokens": 7744966.0, "reward": 1.1184982061386108, "reward_std": 1.507020354270935, "rewards/rollout_reward_func/mean": 1.1184982061386108, "rewards/rollout_reward_func/std": 2.7034482955932617, "sampling/importance_sampling_ratio/max": 0.9559841752052307, "sampling/importance_sampling_ratio/mean": 0.6500717401504517, "sampling/importance_sampling_ratio/min": 0.07981700450181961, "sampling/sampling_logp_difference/max": 1.985783576965332, "sampling/sampling_logp_difference/mean": 0.1327638179063797, "step": 341, "step_time": 9.0049888750018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1634071618318558, "epoch": 0.00342, "grad_norm": 0.1927398443222046, "kl": 0.43610176909714937, "learning_rate": 9.99998916822367e-06, "loss": -0.017, "step": 342, "step_time": 4.80037299399919 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 321.0625, "completions/mean_terminated_length": 321.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9735719412565231, "epoch": 0.00343, "frac_reward_zero_std": 0.25, "grad_norm": 0.2517951726913452, "kl": 1.2916007488965988, "learning_rate": 9.999989097312101e-06, "loss": -0.024, "num_tokens": 7791626.0, "reward": 2.8435845375061035, "reward_std": 2.263068914413452, "rewards/rollout_reward_func/mean": 2.8435845375061035, "rewards/rollout_reward_func/std": 2.6851603984832764, "sampling/importance_sampling_ratio/max": 0.9561312794685364, "sampling/importance_sampling_ratio/mean": 0.7120996713638306, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 1.6570864915847778, "sampling/sampling_logp_difference/mean": 0.09686991572380066, "step": 343, "step_time": 9.283606968998356 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 1.007620207965374, "epoch": 0.00344, "grad_norm": 0.23913979530334473, "kl": 1.3158229663968086, "learning_rate": 9.999989026169173e-06, "loss": -0.0246, "step": 344, "step_time": 5.346575039000527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 353.59375, "completions/mean_terminated_length": 353.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.593155562877655, "epoch": 0.00345, "frac_reward_zero_std": 0.0, "grad_norm": 0.32435905933380127, "kl": 0.8365911543369293, "learning_rate": 9.999988954794887e-06, "loss": -0.0325, "num_tokens": 7840340.0, "reward": -0.8252513408660889, "reward_std": 2.4104456901550293, "rewards/rollout_reward_func/mean": -0.8252513408660889, "rewards/rollout_reward_func/std": 2.50799560546875, "sampling/importance_sampling_ratio/max": 0.9371523261070251, "sampling/importance_sampling_ratio/mean": 0.5303835272789001, "sampling/importance_sampling_ratio/min": 0.008989367634057999, "sampling/sampling_logp_difference/max": 3.1486265659332275, "sampling/sampling_logp_difference/mean": 0.21743834018707275, "step": 345, "step_time": 9.067110759999196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.6456250846385956, "epoch": 0.00346, "grad_norm": 0.3221431374549866, "kl": 0.8605208173394203, "learning_rate": 9.999988883189243e-06, "loss": -0.0331, "step": 346, "step_time": 4.849617085999853 }, { "clip_ratio/high_max": 0.04261363670229912, "clip_ratio/high_mean": 0.02130681835114956, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03267045505344868, "completions/clipped_ratio": 0.0, "completions/max_length": 704.0, "completions/max_terminated_length": 704.0, "completions/mean_length": 341.34375, "completions/mean_terminated_length": 341.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.670075997710228, "epoch": 0.00347, "frac_reward_zero_std": 0.0, "grad_norm": 0.35448339581489563, "kl": 1.5255089309066534, "learning_rate": 9.999988811352238e-06, "loss": -0.0326, "num_tokens": 7889542.0, "reward": 0.48783400654792786, "reward_std": 2.6190638542175293, "rewards/rollout_reward_func/mean": 0.48783400654792786, "rewards/rollout_reward_func/std": 3.310515880584717, "sampling/importance_sampling_ratio/max": 1.1019651889801025, "sampling/importance_sampling_ratio/mean": 0.5956394672393799, "sampling/importance_sampling_ratio/min": 0.005619381088763475, "sampling/sampling_logp_difference/max": 3.1324667930603027, "sampling/sampling_logp_difference/mean": 0.2383347451686859, "step": 347, "step_time": 9.5726181930022 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035369318444281816, "entropy": 1.695484735071659, "epoch": 0.00348, "grad_norm": 0.3075660765171051, "kl": 1.443421021103859, "learning_rate": 9.999988739283876e-06, "loss": -0.0345, "step": 348, "step_time": 5.489794801000244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01578282844275236, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01578282844275236, "completions/clipped_ratio": 0.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 403.34375, "completions/mean_terminated_length": 403.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.4794982969760895, "epoch": 0.00349, "frac_reward_zero_std": 0.0, "grad_norm": 0.27585187554359436, "kl": 2.0096683651208878, "learning_rate": 9.999988666984156e-06, "loss": -0.0624, "num_tokens": 7938891.0, "reward": 1.6513025760650635, "reward_std": 2.6165342330932617, "rewards/rollout_reward_func/mean": 1.6513025760650635, "rewards/rollout_reward_func/std": 2.702831983566284, "sampling/importance_sampling_ratio/max": 0.9559195041656494, "sampling/importance_sampling_ratio/mean": 0.40149810910224915, "sampling/importance_sampling_ratio/min": 9.076789190612468e-27, "sampling/sampling_logp_difference/max": 12.096844673156738, "sampling/sampling_logp_difference/mean": 0.6143110394477844, "step": 349, "step_time": 9.184586949000732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.4695518016815186, "epoch": 0.0035, "grad_norm": 0.26294708251953125, "kl": 1.9027239233255386, "learning_rate": 9.999988594453077e-06, "loss": -0.063, "step": 350, "step_time": 4.87674225900264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 187.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.968774251639843, "epoch": 0.00351, "frac_reward_zero_std": 0.25, "grad_norm": 0.2621966600418091, "kl": 0.414563967846334, "learning_rate": 9.999988521690638e-06, "loss": -0.0321, "num_tokens": 7980021.0, "reward": 2.6809256076812744, "reward_std": 2.229326009750366, "rewards/rollout_reward_func/mean": 2.6809256076812744, "rewards/rollout_reward_func/std": 2.817288875579834, "sampling/importance_sampling_ratio/max": 0.9495455026626587, "sampling/importance_sampling_ratio/mean": 0.7608025670051575, "sampling/importance_sampling_ratio/min": 0.141375333070755, "sampling/sampling_logp_difference/max": 1.2902504205703735, "sampling/sampling_logp_difference/mean": 0.09002771228551865, "step": 351, "step_time": 8.654347085001064 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.9341072924435139, "epoch": 0.00352, "grad_norm": 0.2541278898715973, "kl": 0.407507561147213, "learning_rate": 9.99998844869684e-06, "loss": -0.0323, "step": 352, "step_time": 5.848497950997626 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016098485328257084, "completions/clipped_ratio": 0.03125, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 210.0625, "completions/mean_terminated_length": 216.32257080078125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4651721492409706, "epoch": 0.00353, "frac_reward_zero_std": 0.0, "grad_norm": 0.14283756911754608, "kl": 0.5907175689935684, "learning_rate": 9.999988375471685e-06, "loss": -0.0308, "num_tokens": 8022250.0, "reward": 1.3853031396865845, "reward_std": 2.219818353652954, "rewards/rollout_reward_func/mean": 1.3853031396865845, "rewards/rollout_reward_func/std": 3.3896071910858154, "sampling/importance_sampling_ratio/max": 0.9471941590309143, "sampling/importance_sampling_ratio/mean": 0.6124390959739685, "sampling/importance_sampling_ratio/min": 0.008735882118344307, "sampling/sampling_logp_difference/max": 3.5799460411071777, "sampling/sampling_logp_difference/mean": 0.20062267780303955, "step": 353, "step_time": 8.627821525997206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4374753087759018, "epoch": 0.00354, "grad_norm": 0.1571730226278305, "kl": 0.5628612078726292, "learning_rate": 9.99998830201517e-06, "loss": -0.0307, "step": 354, "step_time": 4.65183494500161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.021701388526707888, "clip_ratio/low_min": 0.013888888992369175, "clip_ratio/region_mean": 0.021701388526707888, "completions/clipped_ratio": 0.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 261.03125, "completions/mean_terminated_length": 261.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6287411078810692, "epoch": 0.00355, "frac_reward_zero_std": 0.0, "grad_norm": 0.3819624185562134, "kl": 1.1847934946417809, "learning_rate": 9.999988228327299e-06, "loss": -0.0403, "num_tokens": 8067124.0, "reward": 2.155503511428833, "reward_std": 2.3002748489379883, "rewards/rollout_reward_func/mean": 2.155503511428833, "rewards/rollout_reward_func/std": 3.0646376609802246, "sampling/importance_sampling_ratio/max": 1.0417324304580688, "sampling/importance_sampling_ratio/mean": 0.6412115097045898, "sampling/importance_sampling_ratio/min": 0.004277425818145275, "sampling/sampling_logp_difference/max": 2.526695966720581, "sampling/sampling_logp_difference/mean": 0.20874309539794922, "step": 355, "step_time": 8.98368642800051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.619583636522293, "epoch": 0.00356, "grad_norm": 0.4194739758968353, "kl": 1.1674808263778687, "learning_rate": 9.999988154408067e-06, "loss": -0.0409, "step": 356, "step_time": 5.866637713999808 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "completions/clipped_ratio": 0.03125, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 219.74192810058594, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.8640389889478683, "epoch": 0.00357, "frac_reward_zero_std": 0.0, "grad_norm": 0.2817835211753845, "kl": 0.8337515853345394, "learning_rate": 9.999988080257476e-06, "loss": -0.0357, "num_tokens": 8110905.0, "reward": 0.24406370520591736, "reward_std": 2.3515827655792236, "rewards/rollout_reward_func/mean": 0.24406370520591736, "rewards/rollout_reward_func/std": 3.1154026985168457, "sampling/importance_sampling_ratio/max": 0.9621064066886902, "sampling/importance_sampling_ratio/mean": 0.5360265970230103, "sampling/importance_sampling_ratio/min": 0.005080046132206917, "sampling/sampling_logp_difference/max": 2.292083263397217, "sampling/sampling_logp_difference/mean": 0.22718463838100433, "step": 357, "step_time": 8.536716192997119 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.012276785913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023640422616153955, "entropy": 1.829159215092659, "epoch": 0.00358, "grad_norm": 0.28971341252326965, "kl": 0.8367649540305138, "learning_rate": 9.99998800587553e-06, "loss": -0.0363, "step": 358, "step_time": 4.646495794002476 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "completions/clipped_ratio": 0.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 401.65625, "completions/mean_terminated_length": 401.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.70551498234272, "epoch": 0.00359, "frac_reward_zero_std": 0.25, "grad_norm": 0.3173404335975647, "kl": 1.219586793333292, "learning_rate": 9.999987931262222e-06, "loss": -0.0158, "num_tokens": 8161286.0, "reward": 2.3184280395507812, "reward_std": 2.0006861686706543, "rewards/rollout_reward_func/mean": 2.3184280395507812, "rewards/rollout_reward_func/std": 2.584787130355835, "sampling/importance_sampling_ratio/max": 1.104809045791626, "sampling/importance_sampling_ratio/mean": 0.5230967998504639, "sampling/importance_sampling_ratio/min": 0.0714048445224762, "sampling/sampling_logp_difference/max": 2.061189651489258, "sampling/sampling_logp_difference/mean": 0.22641366720199585, "step": 359, "step_time": 9.232383213004141 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 1.6826407462358475, "epoch": 0.0036, "grad_norm": 0.30589041113853455, "kl": 1.1964426040649414, "learning_rate": 9.999987856417558e-06, "loss": -0.0166, "step": 360, "step_time": 6.134593370999937 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 406.5625, "completions/mean_terminated_length": 406.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5234084725379944, "epoch": 0.00361, "frac_reward_zero_std": 0.25, "grad_norm": 0.2754845917224884, "kl": 0.6378231756389141, "learning_rate": 9.999987781341531e-06, "loss": -0.0105, "num_tokens": 8211779.0, "reward": 1.9233700037002563, "reward_std": 1.9086968898773193, "rewards/rollout_reward_func/mean": 1.9233700037002563, "rewards/rollout_reward_func/std": 2.6887221336364746, "sampling/importance_sampling_ratio/max": 0.9607442021369934, "sampling/importance_sampling_ratio/mean": 0.5973005890846252, "sampling/importance_sampling_ratio/min": 0.0509052500128746, "sampling/sampling_logp_difference/max": 1.7936636209487915, "sampling/sampling_logp_difference/mean": 0.17178060114383698, "step": 361, "step_time": 9.30107076799868 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020312500186264515, "entropy": 1.502995103597641, "epoch": 0.00362, "grad_norm": 0.24735987186431885, "kl": 0.6232401803135872, "learning_rate": 9.999987706034149e-06, "loss": -0.0112, "step": 362, "step_time": 4.996266091999132 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 683.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 264.15625, "completions/mean_terminated_length": 264.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1874679550528526, "epoch": 0.00363, "frac_reward_zero_std": 0.0, "grad_norm": 0.5505595207214355, "kl": 0.7418975243344903, "learning_rate": 9.999987630495407e-06, "loss": -0.0097, "num_tokens": 8254664.0, "reward": 2.9919590950012207, "reward_std": 2.5565714836120605, "rewards/rollout_reward_func/mean": 2.9919590950012207, "rewards/rollout_reward_func/std": 2.557833194732666, "sampling/importance_sampling_ratio/max": 0.9466567635536194, "sampling/importance_sampling_ratio/mean": 0.6983084678649902, "sampling/importance_sampling_ratio/min": 0.1495341658592224, "sampling/sampling_logp_difference/max": 1.671694278717041, "sampling/sampling_logp_difference/mean": 0.12522225081920624, "step": 363, "step_time": 8.697284582998691 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0234375, "entropy": 1.1594802141189575, "epoch": 0.00364, "grad_norm": 0.2814042568206787, "kl": 0.7696979371830821, "learning_rate": 9.999987554725306e-06, "loss": -0.0123, "step": 364, "step_time": 4.817786374000207 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016406250186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 425.71875, "completions/mean_terminated_length": 425.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7645317018032074, "epoch": 0.00365, "frac_reward_zero_std": 0.0, "grad_norm": 0.37208500504493713, "kl": 0.871656721457839, "learning_rate": 9.999987478723849e-06, "loss": -0.0598, "num_tokens": 8307165.0, "reward": 0.9371204376220703, "reward_std": 2.9746716022491455, "rewards/rollout_reward_func/mean": 0.9371204376220703, "rewards/rollout_reward_func/std": 3.158830404281616, "sampling/importance_sampling_ratio/max": 0.917720377445221, "sampling/importance_sampling_ratio/mean": 0.5043578743934631, "sampling/importance_sampling_ratio/min": 2.2585791337225228e-14, "sampling/sampling_logp_difference/max": 20.30316734313965, "sampling/sampling_logp_difference/mean": 0.3514975607395172, "step": 365, "step_time": 10.407397802997366 }, { "clip_ratio/high_max": 0.026988636702299118, "clip_ratio/high_mean": 0.013494318351149559, "clip_ratio/low_mean": 0.018876262474805117, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.032370580825954676, "entropy": 1.7469503730535507, "epoch": 0.00366, "grad_norm": 0.3324364125728607, "kl": 0.8513135779649019, "learning_rate": 9.99998740249103e-06, "loss": -0.0598, "step": 366, "step_time": 4.90484485900015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 732.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 436.15625, "completions/mean_terminated_length": 426.6128845214844, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2785592302680016, "epoch": 0.00367, "frac_reward_zero_std": 0.0, "grad_norm": 0.310019850730896, "kl": 0.8604226745665073, "learning_rate": 9.999987326026854e-06, "loss": -0.0002, "num_tokens": 8360782.0, "reward": -0.38316720724105835, "reward_std": 2.2417984008789062, "rewards/rollout_reward_func/mean": -0.38316720724105835, "rewards/rollout_reward_func/std": 2.773216724395752, "sampling/importance_sampling_ratio/max": 0.9272623658180237, "sampling/importance_sampling_ratio/mean": 0.6028539538383484, "sampling/importance_sampling_ratio/min": 0.06366032361984253, "sampling/sampling_logp_difference/max": 2.1091957092285156, "sampling/sampling_logp_difference/mean": 0.1411171406507492, "step": 367, "step_time": 9.235626193998542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02569444477558136, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02569444477558136, "entropy": 1.2596362233161926, "epoch": 0.00368, "grad_norm": 0.23309101164340973, "kl": 0.8623979464173317, "learning_rate": 9.999987249331323e-06, "loss": -0.0006, "step": 368, "step_time": 4.98633256499852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011386639904230833, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011386639904230833, "completions/clipped_ratio": 0.03125, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 101.65625, "completions/mean_terminated_length": 104.41934967041016, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3875894397497177, "epoch": 0.00369, "frac_reward_zero_std": 0.25, "grad_norm": 0.2303965836763382, "kl": 0.37624684255570173, "learning_rate": 9.999987172404429e-06, "loss": -0.0013, "num_tokens": 8398491.0, "reward": 0.6670448780059814, "reward_std": 2.2297415733337402, "rewards/rollout_reward_func/mean": 0.6670448780059814, "rewards/rollout_reward_func/std": 3.323248863220215, "sampling/importance_sampling_ratio/max": 0.958794355392456, "sampling/importance_sampling_ratio/mean": 0.6488603353500366, "sampling/importance_sampling_ratio/min": 0.0006162874633446336, "sampling/sampling_logp_difference/max": 1.9675636291503906, "sampling/sampling_logp_difference/mean": 0.19243209064006805, "step": 369, "step_time": 9.431921815999885 }, { "clip_ratio/high_max": 0.04340277798473835, "clip_ratio/high_mean": 0.021701388992369175, "clip_ratio/low_mean": 0.018696581479161978, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04039797047153115, "entropy": 1.3611838594079018, "epoch": 0.0037, "grad_norm": 0.15645305812358856, "kl": 0.366543254815042, "learning_rate": 9.999987095246177e-06, "loss": -0.0014, "step": 370, "step_time": 4.58720657000049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 239.5625, "completions/mean_terminated_length": 239.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3037877455353737, "epoch": 0.00371, "frac_reward_zero_std": 0.0, "grad_norm": 0.26662781834602356, "kl": 1.5429198946803808, "learning_rate": 9.999987017856568e-06, "loss": -0.0316, "num_tokens": 8442204.0, "reward": 2.120635747909546, "reward_std": 2.5716147422790527, "rewards/rollout_reward_func/mean": 2.120635747909546, "rewards/rollout_reward_func/std": 2.7047805786132812, "sampling/importance_sampling_ratio/max": 0.9856818318367004, "sampling/importance_sampling_ratio/mean": 0.6892342567443848, "sampling/importance_sampling_ratio/min": 0.061170291155576706, "sampling/sampling_logp_difference/max": 2.595931053161621, "sampling/sampling_logp_difference/mean": 0.1643177717924118, "step": 371, "step_time": 8.615079598002922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3101419657468796, "epoch": 0.00372, "grad_norm": 0.28210484981536865, "kl": 1.6044788649305701, "learning_rate": 9.999986940235598e-06, "loss": -0.0315, "step": 372, "step_time": 4.7395233280003595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 409.34375, "completions/mean_terminated_length": 409.34375, "completions/min_length": 254.0, "completions/min_terminated_length": 254.0, "entropy": 1.1048774346709251, "epoch": 0.00373, "frac_reward_zero_std": 0.0, "grad_norm": 0.26491352915763855, "kl": 0.9067748785018921, "learning_rate": 9.99998686238327e-06, "loss": -0.0359, "num_tokens": 8494651.0, "reward": 2.9773950576782227, "reward_std": 1.741140365600586, "rewards/rollout_reward_func/mean": 2.9773950576782227, "rewards/rollout_reward_func/std": 2.7149977684020996, "sampling/importance_sampling_ratio/max": 1.275861382484436, "sampling/importance_sampling_ratio/mean": 0.6768013834953308, "sampling/importance_sampling_ratio/min": 0.03695348650217056, "sampling/sampling_logp_difference/max": 2.1263604164123535, "sampling/sampling_logp_difference/mean": 0.12662099301815033, "step": 373, "step_time": 10.159648542001378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1019796170294285, "epoch": 0.00374, "grad_norm": 0.2642301023006439, "kl": 0.9038641378283501, "learning_rate": 9.999986784299586e-06, "loss": -0.0359, "step": 374, "step_time": 4.899373686004765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.03125, "completions/max_length": 701.0, "completions/max_terminated_length": 701.0, "completions/mean_length": 254.59375, "completions/mean_terminated_length": 243.90321350097656, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3845877349376678, "epoch": 0.00375, "frac_reward_zero_std": 0.25, "grad_norm": 0.39121106266975403, "kl": 0.8045142628252506, "learning_rate": 9.999986705984542e-06, "loss": -0.0369, "num_tokens": 8537528.0, "reward": 1.819716453552246, "reward_std": 0.9870595932006836, "rewards/rollout_reward_func/mean": 1.819716453552246, "rewards/rollout_reward_func/std": 3.4068214893341064, "sampling/importance_sampling_ratio/max": 1.1742067337036133, "sampling/importance_sampling_ratio/mean": 0.6601614952087402, "sampling/importance_sampling_ratio/min": 5.239702295511961e-05, "sampling/sampling_logp_difference/max": 2.9780263900756836, "sampling/sampling_logp_difference/mean": 0.1673022359609604, "step": 375, "step_time": 9.011632289999397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 1.3954883441329002, "epoch": 0.00376, "grad_norm": 0.36180928349494934, "kl": 0.7890097126364708, "learning_rate": 9.99998662743814e-06, "loss": -0.0381, "step": 376, "step_time": 4.829675114997372 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 237.84375, "completions/mean_terminated_length": 237.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6891810037195683, "epoch": 0.00377, "frac_reward_zero_std": 0.25, "grad_norm": 0.24298670887947083, "kl": 0.7927531450986862, "learning_rate": 9.999986548660378e-06, "loss": -0.0119, "num_tokens": 8580027.0, "reward": 3.031191349029541, "reward_std": 1.8363940715789795, "rewards/rollout_reward_func/mean": 3.031191349029541, "rewards/rollout_reward_func/std": 2.4153523445129395, "sampling/importance_sampling_ratio/max": 0.9616336822509766, "sampling/importance_sampling_ratio/mean": 0.8099668025970459, "sampling/importance_sampling_ratio/min": 0.1540883481502533, "sampling/sampling_logp_difference/max": 1.4911458492279053, "sampling/sampling_logp_difference/mean": 0.062583327293396, "step": 377, "step_time": 9.06740899900251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.6949050538241863, "epoch": 0.00378, "grad_norm": 0.2835559546947479, "kl": 0.7961691748350859, "learning_rate": 9.999986469651259e-06, "loss": -0.0126, "step": 378, "step_time": 5.136212236000574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020312500186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020312500186264515, "completions/clipped_ratio": 0.03125, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 296.71875, "completions/mean_terminated_length": 288.93548583984375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7077213153243065, "epoch": 0.00379, "frac_reward_zero_std": 0.25, "grad_norm": 0.17925234138965607, "kl": 0.5510471314191818, "learning_rate": 9.999986390410781e-06, "loss": -0.0738, "num_tokens": 8626165.0, "reward": 1.53468656539917, "reward_std": 2.3963210582733154, "rewards/rollout_reward_func/mean": 1.53468656539917, "rewards/rollout_reward_func/std": 3.0718045234680176, "sampling/importance_sampling_ratio/max": 0.9630358219146729, "sampling/importance_sampling_ratio/mean": 0.5650612115859985, "sampling/importance_sampling_ratio/min": 0.007991546764969826, "sampling/sampling_logp_difference/max": 2.869220733642578, "sampling/sampling_logp_difference/mean": 0.24710296094417572, "step": 379, "step_time": 9.051939113995104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.04131944477558136, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04131944477558136, "entropy": 1.7719849720597267, "epoch": 0.0038, "grad_norm": 0.17335139214992523, "kl": 0.5573027273640037, "learning_rate": 9.999986310938943e-06, "loss": -0.0741, "step": 380, "step_time": 4.844613993000166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 248.09375, "completions/mean_terminated_length": 248.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9236686937510967, "epoch": 0.00381, "frac_reward_zero_std": 0.25, "grad_norm": 0.2777703106403351, "kl": 0.6690422743558884, "learning_rate": 9.999986231235748e-06, "loss": -0.0147, "num_tokens": 8669548.0, "reward": 3.492656946182251, "reward_std": 1.6360095739364624, "rewards/rollout_reward_func/mean": 3.492656946182251, "rewards/rollout_reward_func/std": 2.0416362285614014, "sampling/importance_sampling_ratio/max": 0.9614759087562561, "sampling/importance_sampling_ratio/mean": 0.744930624961853, "sampling/importance_sampling_ratio/min": 0.12825573980808258, "sampling/sampling_logp_difference/max": 1.9595768451690674, "sampling/sampling_logp_difference/mean": 0.10121843218803406, "step": 381, "step_time": 8.743303163999371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.9337740689516068, "epoch": 0.00382, "grad_norm": 0.2255103588104248, "kl": 0.6726816538721323, "learning_rate": 9.999986151301195e-06, "loss": -0.0154, "step": 382, "step_time": 5.73055318599836 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018876262940466404, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 257.9375, "completions/mean_terminated_length": 257.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0782161056995392, "epoch": 0.00383, "frac_reward_zero_std": 0.25, "grad_norm": 0.22213569283485413, "kl": 0.6959233433008194, "learning_rate": 9.999986071135283e-06, "loss": -0.0136, "num_tokens": 8712653.0, "reward": 2.030038595199585, "reward_std": 1.3326318264007568, "rewards/rollout_reward_func/mean": 2.030038595199585, "rewards/rollout_reward_func/std": 3.2706613540649414, "sampling/importance_sampling_ratio/max": 0.9606509804725647, "sampling/importance_sampling_ratio/mean": 0.7021893262863159, "sampling/importance_sampling_ratio/min": 0.029716050252318382, "sampling/sampling_logp_difference/max": 2.372709274291992, "sampling/sampling_logp_difference/mean": 0.1394049972295761, "step": 383, "step_time": 8.649402141998507 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.015873016323894262, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02155483467504382, "entropy": 1.072866901755333, "epoch": 0.00384, "grad_norm": 0.20205074548721313, "kl": 0.6870684698224068, "learning_rate": 9.999985990738012e-06, "loss": -0.0139, "step": 384, "step_time": 4.662061287001052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 103.28125, "completions/mean_terminated_length": 103.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6115768365561962, "epoch": 0.00385, "frac_reward_zero_std": 0.5, "grad_norm": 0.14488983154296875, "kl": 0.40117793483659625, "learning_rate": 9.999985910109383e-06, "loss": -0.0305, "num_tokens": 8747889.0, "reward": 3.4793825149536133, "reward_std": 1.1826783418655396, "rewards/rollout_reward_func/mean": 3.4793825149536133, "rewards/rollout_reward_func/std": 1.7470303773880005, "sampling/importance_sampling_ratio/max": 1.0491026639938354, "sampling/importance_sampling_ratio/mean": 0.8409682512283325, "sampling/importance_sampling_ratio/min": 0.04923059418797493, "sampling/sampling_logp_difference/max": 2.9414844512939453, "sampling/sampling_logp_difference/mean": 0.08488437533378601, "step": 385, "step_time": 8.089484422001988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6052515655755997, "epoch": 0.00386, "grad_norm": 0.13312529027462006, "kl": 0.3786091944202781, "learning_rate": 9.999985829249397e-06, "loss": -0.031, "step": 386, "step_time": 5.330961357001797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 271.84375, "completions/mean_terminated_length": 271.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1462943069636822, "epoch": 0.00387, "frac_reward_zero_std": 0.5, "grad_norm": 0.25601375102996826, "kl": 0.8812264651060104, "learning_rate": 9.999985748158049e-06, "loss": -0.0234, "num_tokens": 8790356.0, "reward": 2.9054598808288574, "reward_std": 1.152131199836731, "rewards/rollout_reward_func/mean": 2.9054598808288574, "rewards/rollout_reward_func/std": 2.043398141860962, "sampling/importance_sampling_ratio/max": 1.166922688484192, "sampling/importance_sampling_ratio/mean": 0.7347685098648071, "sampling/importance_sampling_ratio/min": 0.01752166450023651, "sampling/sampling_logp_difference/max": 2.3297173976898193, "sampling/sampling_logp_difference/mean": 0.15601056814193726, "step": 387, "step_time": 8.458440506001352 }, { "clip_ratio/high_max": 0.029513888992369175, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 1.1361296735703945, "epoch": 0.00388, "grad_norm": 0.1524808257818222, "kl": 0.8692516423761845, "learning_rate": 9.999985666835346e-06, "loss": -0.0245, "step": 388, "step_time": 4.668906263997997 }, { "clip_ratio/high_max": 0.03775252588093281, "clip_ratio/high_mean": 0.018876262940466404, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02512626303359866, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 503.6875, "completions/mean_terminated_length": 503.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.8831903487443924, "epoch": 0.00389, "frac_reward_zero_std": 0.0, "grad_norm": 0.25341981649398804, "kl": 1.2336293160915375, "learning_rate": 9.999985585281282e-06, "loss": -0.0314, "num_tokens": 8846353.0, "reward": 1.8189563751220703, "reward_std": 2.4778614044189453, "rewards/rollout_reward_func/mean": 1.8189563751220703, "rewards/rollout_reward_func/std": 2.9820988178253174, "sampling/importance_sampling_ratio/max": 0.9277899861335754, "sampling/importance_sampling_ratio/mean": 0.45472609996795654, "sampling/importance_sampling_ratio/min": 0.002194714732468128, "sampling/sampling_logp_difference/max": 2.7406389713287354, "sampling/sampling_logp_difference/mean": 0.2657162547111511, "step": 389, "step_time": 8.768340771999647 }, { "clip_ratio/high_max": 0.06022727396339178, "clip_ratio/high_mean": 0.03011363698169589, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03011363698169589, "entropy": 1.8826814591884613, "epoch": 0.0039, "grad_norm": 0.22279806435108185, "kl": 1.2312758080661297, "learning_rate": 9.999985503495862e-06, "loss": -0.0313, "step": 390, "step_time": 5.7813630040000135 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 368.84375, "completions/mean_terminated_length": 368.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.671282947063446, "epoch": 0.00391, "frac_reward_zero_std": 0.25, "grad_norm": 0.3146381676197052, "kl": 1.1710470989346504, "learning_rate": 9.999985421479081e-06, "loss": -0.035, "num_tokens": 8895086.0, "reward": 2.4937095642089844, "reward_std": 2.027050495147705, "rewards/rollout_reward_func/mean": 2.4937095642089844, "rewards/rollout_reward_func/std": 2.5027801990509033, "sampling/importance_sampling_ratio/max": 0.9633277654647827, "sampling/importance_sampling_ratio/mean": 0.5394853353500366, "sampling/importance_sampling_ratio/min": 0.0052732741460204124, "sampling/sampling_logp_difference/max": 3.25679087638855, "sampling/sampling_logp_difference/mean": 0.2210812270641327, "step": 391, "step_time": 8.850751285002843 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.011931818444281816, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02582070743665099, "entropy": 1.6436434090137482, "epoch": 0.00392, "grad_norm": 0.21516360342502594, "kl": 1.1953881531953812, "learning_rate": 9.999985339230944e-06, "loss": -0.0356, "step": 392, "step_time": 4.824334780993013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 450.28125, "completions/mean_terminated_length": 450.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.6503345519304276, "epoch": 0.00393, "frac_reward_zero_std": 0.0, "grad_norm": 0.24669240415096283, "kl": 1.2281599715352058, "learning_rate": 9.999985256751446e-06, "loss": -0.0171, "num_tokens": 8949162.0, "reward": 0.9797239303588867, "reward_std": 2.5303940773010254, "rewards/rollout_reward_func/mean": 0.9797239303588867, "rewards/rollout_reward_func/std": 3.361760377883911, "sampling/importance_sampling_ratio/max": 0.9790135025978088, "sampling/importance_sampling_ratio/mean": 0.5291221141815186, "sampling/importance_sampling_ratio/min": 0.014598107896745205, "sampling/sampling_logp_difference/max": 3.6563427448272705, "sampling/sampling_logp_difference/mean": 0.21206584572792053, "step": 393, "step_time": 8.405811908001851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.6443089544773102, "epoch": 0.00394, "grad_norm": 0.2501630485057831, "kl": 1.2324709594249725, "learning_rate": 9.999985174040591e-06, "loss": -0.0173, "step": 394, "step_time": 5.17276651499742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 713.0, "completions/max_terminated_length": 713.0, "completions/mean_length": 275.75, "completions/mean_terminated_length": 284.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0395774953067303, "epoch": 0.00395, "frac_reward_zero_std": 0.25, "grad_norm": 0.16042377054691315, "kl": 0.43584238924086094, "learning_rate": 9.999985091098378e-06, "loss": 0.0046, "num_tokens": 8990535.0, "reward": 3.375122547149658, "reward_std": 0.9113144874572754, "rewards/rollout_reward_func/mean": 3.375122547149658, "rewards/rollout_reward_func/std": 1.8165526390075684, "sampling/importance_sampling_ratio/max": 0.9626546502113342, "sampling/importance_sampling_ratio/mean": 0.7132977247238159, "sampling/importance_sampling_ratio/min": 0.008001120761036873, "sampling/sampling_logp_difference/max": 2.0046255588531494, "sampling/sampling_logp_difference/mean": 0.13200899958610535, "step": 395, "step_time": 8.818685421001646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.0527314338833094, "epoch": 0.00396, "grad_norm": 0.14906315505504608, "kl": 0.4353948198258877, "learning_rate": 9.999985007924806e-06, "loss": 0.0044, "step": 396, "step_time": 4.516616265995253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 320.71875, "completions/mean_terminated_length": 320.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4666688852012157, "epoch": 0.00397, "frac_reward_zero_std": 0.0, "grad_norm": 0.20183025300502777, "kl": 0.9912069700658321, "learning_rate": 9.999984924519876e-06, "loss": -0.0434, "num_tokens": 9037984.0, "reward": 2.3023900985717773, "reward_std": 2.1240081787109375, "rewards/rollout_reward_func/mean": 2.3023900985717773, "rewards/rollout_reward_func/std": 2.6775059700012207, "sampling/importance_sampling_ratio/max": 0.9998309016227722, "sampling/importance_sampling_ratio/mean": 0.6564942002296448, "sampling/importance_sampling_ratio/min": 0.00605559628456831, "sampling/sampling_logp_difference/max": 2.728451728820801, "sampling/sampling_logp_difference/mean": 0.18562692403793335, "step": 397, "step_time": 8.515039402000184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4195375889539719, "epoch": 0.00398, "grad_norm": 0.20277740061283112, "kl": 0.9749030917882919, "learning_rate": 9.999984840883588e-06, "loss": -0.044, "step": 398, "step_time": 4.636130017997857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "completions/clipped_ratio": 0.0, "completions/max_length": 653.0, "completions/max_terminated_length": 653.0, "completions/mean_length": 176.84375, "completions/mean_terminated_length": 176.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1753949597477913, "epoch": 0.00399, "frac_reward_zero_std": 0.5, "grad_norm": 0.28139615058898926, "kl": 0.386496352031827, "learning_rate": 9.999984757015939e-06, "loss": -0.017, "num_tokens": 9078549.0, "reward": 2.526036262512207, "reward_std": 1.4393532276153564, "rewards/rollout_reward_func/mean": 2.526036262512207, "rewards/rollout_reward_func/std": 2.6202785968780518, "sampling/importance_sampling_ratio/max": 0.9582644104957581, "sampling/importance_sampling_ratio/mean": 0.6881740093231201, "sampling/importance_sampling_ratio/min": 5.823642964292269e-20, "sampling/sampling_logp_difference/max": 18.398603439331055, "sampling/sampling_logp_difference/mean": 0.3780379593372345, "step": 399, "step_time": 9.384990449001634 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009444444440305233, "entropy": 1.1783046200871468, "epoch": 0.004, "grad_norm": 0.2743169367313385, "kl": 0.38536059809848666, "learning_rate": 9.999984672916935e-06, "loss": -0.0174, "step": 400, "step_time": 4.581188274996748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 673.0, "completions/max_terminated_length": 673.0, "completions/mean_length": 288.125, "completions/mean_terminated_length": 288.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2683240547776222, "epoch": 0.00401, "frac_reward_zero_std": 0.25, "grad_norm": 0.428733229637146, "kl": 0.7566943829879165, "learning_rate": 9.999984588586572e-06, "loss": -0.053, "num_tokens": 9123875.0, "reward": 1.7429122924804688, "reward_std": 1.9235162734985352, "rewards/rollout_reward_func/mean": 1.7429122924804688, "rewards/rollout_reward_func/std": 3.210886001586914, "sampling/importance_sampling_ratio/max": 0.9630014300346375, "sampling/importance_sampling_ratio/mean": 0.6463063955307007, "sampling/importance_sampling_ratio/min": 0.013963308185338974, "sampling/sampling_logp_difference/max": 4.214041233062744, "sampling/sampling_logp_difference/mean": 0.17599323391914368, "step": 401, "step_time": 8.59246557899678 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.030849359929561615, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03709936002269387, "entropy": 1.3472656607627869, "epoch": 0.00402, "grad_norm": 0.16581563651561737, "kl": 0.4862985275685787, "learning_rate": 9.999984504024848e-06, "loss": -0.0551, "step": 402, "step_time": 4.639862392996292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.022569444496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022569444496184587, "completions/clipped_ratio": 0.03125, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 273.28125, "completions/mean_terminated_length": 281.58062744140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6211724951863289, "epoch": 0.00403, "frac_reward_zero_std": 0.0, "grad_norm": 0.18200482428073883, "kl": 0.7642124127596617, "learning_rate": 9.999984419231768e-06, "loss": -0.0529, "num_tokens": 9168607.0, "reward": 1.0919134616851807, "reward_std": 2.242924690246582, "rewards/rollout_reward_func/mean": 1.0919134616851807, "rewards/rollout_reward_func/std": 3.307076930999756, "sampling/importance_sampling_ratio/max": 0.9617549777030945, "sampling/importance_sampling_ratio/mean": 0.6035383343696594, "sampling/importance_sampling_ratio/min": 0.004227806348353624, "sampling/sampling_logp_difference/max": 2.3259215354919434, "sampling/sampling_logp_difference/mean": 0.2082979530096054, "step": 403, "step_time": 9.484533584998644 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.02467757952399552, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03249007952399552, "entropy": 1.635844886302948, "epoch": 0.00404, "grad_norm": 0.19850341975688934, "kl": 0.7541642859578133, "learning_rate": 9.999984334207328e-06, "loss": -0.0532, "step": 404, "step_time": 4.6605028600006335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 242.34375, "completions/mean_terminated_length": 237.41934204101562, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6007032990455627, "epoch": 0.00405, "frac_reward_zero_std": 0.25, "grad_norm": 0.17873376607894897, "kl": 0.9406005404889584, "learning_rate": 9.99998424895153e-06, "loss": -0.044, "num_tokens": 9212824.0, "reward": 2.5065393447875977, "reward_std": 2.110628128051758, "rewards/rollout_reward_func/mean": 2.5065393447875977, "rewards/rollout_reward_func/std": 2.4242451190948486, "sampling/importance_sampling_ratio/max": 0.9526897668838501, "sampling/importance_sampling_ratio/mean": 0.5814390182495117, "sampling/importance_sampling_ratio/min": 0.024728206917643547, "sampling/sampling_logp_difference/max": 1.8727293014526367, "sampling/sampling_logp_difference/mean": 0.1928393840789795, "step": 405, "step_time": 8.561037620000207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.6255469918251038, "epoch": 0.00406, "grad_norm": 0.1750965416431427, "kl": 0.928286574780941, "learning_rate": 9.999984163464374e-06, "loss": -0.0437, "step": 406, "step_time": 4.677692854997076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 229.0, "completions/mean_terminated_length": 229.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4842503927648067, "epoch": 0.00407, "frac_reward_zero_std": 0.25, "grad_norm": 0.2259788066148758, "kl": 0.9605324044823647, "learning_rate": 9.999984077745861e-06, "loss": -0.011, "num_tokens": 9256370.0, "reward": 0.8600946664810181, "reward_std": 1.071134090423584, "rewards/rollout_reward_func/mean": 0.8600946664810181, "rewards/rollout_reward_func/std": 2.980231523513794, "sampling/importance_sampling_ratio/max": 0.962578296661377, "sampling/importance_sampling_ratio/mean": 0.6725947856903076, "sampling/importance_sampling_ratio/min": 0.0802246704697609, "sampling/sampling_logp_difference/max": 1.4235496520996094, "sampling/sampling_logp_difference/mean": 0.156459778547287, "step": 407, "step_time": 9.056328904000111 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 1.4552698358893394, "epoch": 0.00408, "grad_norm": 0.21242499351501465, "kl": 0.9775998406112194, "learning_rate": 9.999983991795988e-06, "loss": -0.0119, "step": 408, "step_time": 5.165615242000058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 682.0, "completions/max_terminated_length": 682.0, "completions/mean_length": 230.78125, "completions/mean_terminated_length": 230.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1273684352636337, "epoch": 0.00409, "frac_reward_zero_std": 0.5, "grad_norm": 0.18425102531909943, "kl": 0.6990442997775972, "learning_rate": 9.999983905614758e-06, "loss": -0.0156, "num_tokens": 9299610.0, "reward": 2.7308783531188965, "reward_std": 1.4985806941986084, "rewards/rollout_reward_func/mean": 2.7308783531188965, "rewards/rollout_reward_func/std": 2.407177448272705, "sampling/importance_sampling_ratio/max": 0.9644548892974854, "sampling/importance_sampling_ratio/mean": 0.6995316743850708, "sampling/importance_sampling_ratio/min": 1.319401141084439e-26, "sampling/sampling_logp_difference/max": 16.010589599609375, "sampling/sampling_logp_difference/mean": 0.4741711914539337, "step": 409, "step_time": 8.520425749999049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0022321429569274187, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0022321429569274187, "entropy": 1.1386114843189716, "epoch": 0.0041, "grad_norm": 0.1938888281583786, "kl": 0.6935189832001925, "learning_rate": 9.999983819202168e-06, "loss": -0.0157, "step": 410, "step_time": 4.628449052999713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 746.0, "completions/max_terminated_length": 746.0, "completions/mean_length": 301.5625, "completions/mean_terminated_length": 301.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3385748118162155, "epoch": 0.00411, "frac_reward_zero_std": 0.25, "grad_norm": 0.2296014428138733, "kl": 1.018123384565115, "learning_rate": 9.99998373255822e-06, "loss": -0.0458, "num_tokens": 9345689.0, "reward": 1.8083058595657349, "reward_std": 1.955316185951233, "rewards/rollout_reward_func/mean": 1.8083058595657349, "rewards/rollout_reward_func/std": 3.5019490718841553, "sampling/importance_sampling_ratio/max": 0.9630106687545776, "sampling/importance_sampling_ratio/mean": 0.6345935463905334, "sampling/importance_sampling_ratio/min": 0.12496326118707657, "sampling/sampling_logp_difference/max": 1.883876919746399, "sampling/sampling_logp_difference/mean": 0.13683193922042847, "step": 411, "step_time": 8.787215086000288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.32479889690876, "epoch": 0.00412, "grad_norm": 0.21866807341575623, "kl": 1.0488522835075855, "learning_rate": 9.999983645682915e-06, "loss": -0.0466, "step": 412, "step_time": 5.863547598002697 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 614.0, "completions/max_terminated_length": 614.0, "completions/mean_length": 278.53125, "completions/mean_terminated_length": 278.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8733599185943604, "epoch": 0.00413, "frac_reward_zero_std": 0.0, "grad_norm": 0.38103020191192627, "kl": 0.7639380730688572, "learning_rate": 9.999983558576249e-06, "loss": -0.0351, "num_tokens": 9391529.0, "reward": 1.527383804321289, "reward_std": 3.4437432289123535, "rewards/rollout_reward_func/mean": 1.527383804321289, "rewards/rollout_reward_func/std": 3.305248260498047, "sampling/importance_sampling_ratio/max": 0.9417689442634583, "sampling/importance_sampling_ratio/mean": 0.5305474996566772, "sampling/importance_sampling_ratio/min": 0.011786975897848606, "sampling/sampling_logp_difference/max": 2.270688772201538, "sampling/sampling_logp_difference/mean": 0.2683383524417877, "step": 413, "step_time": 8.362449159998505 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.8653930872678757, "epoch": 0.00414, "grad_norm": 0.3702622652053833, "kl": 0.771634254604578, "learning_rate": 9.999983471238228e-06, "loss": -0.0348, "step": 414, "step_time": 4.53217967999808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 754.0, "completions/max_terminated_length": 754.0, "completions/mean_length": 167.0, "completions/mean_terminated_length": 167.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4644916355609894, "epoch": 0.00415, "frac_reward_zero_std": 0.5, "grad_norm": 0.24087989330291748, "kl": 0.8416146114468575, "learning_rate": 9.999983383668846e-06, "loss": -0.0199, "num_tokens": 9431291.0, "reward": 1.2915470600128174, "reward_std": 0.6882351040840149, "rewards/rollout_reward_func/mean": 1.2915470600128174, "rewards/rollout_reward_func/std": 2.9829814434051514, "sampling/importance_sampling_ratio/max": 1.2552306652069092, "sampling/importance_sampling_ratio/mean": 0.6582944989204407, "sampling/importance_sampling_ratio/min": 0.05056947469711304, "sampling/sampling_logp_difference/max": 2.8544087409973145, "sampling/sampling_logp_difference/mean": 0.184041827917099, "step": 415, "step_time": 8.456398943999375 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.016741071827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022991071920841932, "entropy": 1.4685882553458214, "epoch": 0.00416, "grad_norm": 0.2090064138174057, "kl": 0.8634522706270218, "learning_rate": 9.999983295868108e-06, "loss": -0.0199, "step": 416, "step_time": 5.492146507998768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005893640452995896, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005893640452995896, "completions/clipped_ratio": 0.03125, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 308.375, "completions/mean_terminated_length": 317.8064270019531, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.377194494009018, "epoch": 0.00417, "frac_reward_zero_std": 0.0, "grad_norm": 0.2176218181848526, "kl": 0.7961371932178736, "learning_rate": 9.999983207836011e-06, "loss": -0.031, "num_tokens": 9477226.0, "reward": 3.120671272277832, "reward_std": 2.316833257675171, "rewards/rollout_reward_func/mean": 3.120671272277832, "rewards/rollout_reward_func/std": 2.3800957202911377, "sampling/importance_sampling_ratio/max": 0.9549266695976257, "sampling/importance_sampling_ratio/mean": 0.6009823679924011, "sampling/importance_sampling_ratio/min": 2.1598137368199558e-14, "sampling/sampling_logp_difference/max": 14.051292419433594, "sampling/sampling_logp_difference/mean": 0.3396063446998596, "step": 417, "step_time": 8.726921611998478 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.01145833358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "entropy": 1.3800055906176567, "epoch": 0.00418, "grad_norm": 0.2097674012184143, "kl": 0.7855428312905133, "learning_rate": 9.999983119572554e-06, "loss": -0.0311, "step": 418, "step_time": 4.60749810499874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.03125, "completions/max_length": 549.0, "completions/max_terminated_length": 549.0, "completions/mean_length": 84.875, "completions/mean_terminated_length": 87.09677124023438, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.230454072356224, "epoch": 0.00419, "frac_reward_zero_std": 0.5, "grad_norm": 0.16242888569831848, "kl": 0.836086668074131, "learning_rate": 9.99998303107774e-06, "loss": -0.0235, "num_tokens": 9513681.0, "reward": 1.6640329360961914, "reward_std": 1.1952049732208252, "rewards/rollout_reward_func/mean": 1.6640329360961914, "rewards/rollout_reward_func/std": 3.1668543815612793, "sampling/importance_sampling_ratio/max": 1.1742010116577148, "sampling/importance_sampling_ratio/mean": 0.7053542137145996, "sampling/importance_sampling_ratio/min": 4.741262372762577e-18, "sampling/sampling_logp_difference/max": 16.695178985595703, "sampling/sampling_logp_difference/mean": 0.35643723607063293, "step": 419, "step_time": 8.024724279997827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 1.2239003255963326, "epoch": 0.0042, "grad_norm": 0.20831312239170074, "kl": 0.8200475834310055, "learning_rate": 9.999982942351567e-06, "loss": -0.0241, "step": 420, "step_time": 5.235722489000182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.025173611473292112, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025173611473292112, "completions/clipped_ratio": 0.03125, "completions/max_length": 463.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 62.78125, "completions/mean_terminated_length": 64.29032135009766, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6148314587771893, "epoch": 0.00421, "frac_reward_zero_std": 0.25, "grad_norm": 0.1107560470700264, "kl": 0.6927661732770503, "learning_rate": 9.999982853394038e-06, "loss": -0.0175, "num_tokens": 9551008.0, "reward": -0.8195567727088928, "reward_std": 0.9088810086250305, "rewards/rollout_reward_func/mean": -0.8195567727088928, "rewards/rollout_reward_func/std": 3.0913820266723633, "sampling/importance_sampling_ratio/max": 0.9914289116859436, "sampling/importance_sampling_ratio/mean": 0.619510293006897, "sampling/importance_sampling_ratio/min": 0.006041227374225855, "sampling/sampling_logp_difference/max": 2.0853612422943115, "sampling/sampling_logp_difference/mean": 0.24715527892112732, "step": 421, "step_time": 7.387068167998223 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.03298611147329211, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04340277845039964, "entropy": 1.608761128038168, "epoch": 0.00422, "grad_norm": 0.11376523971557617, "kl": 0.6881179716438055, "learning_rate": 9.999982764205148e-06, "loss": -0.0175, "step": 422, "step_time": 3.9626954269970156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 726.0, "completions/max_terminated_length": 726.0, "completions/mean_length": 375.625, "completions/mean_terminated_length": 375.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.042355790734291, "epoch": 0.00423, "frac_reward_zero_std": 0.25, "grad_norm": 0.15864357352256775, "kl": 0.7265068851411343, "learning_rate": 9.999982674784901e-06, "loss": -0.0348, "num_tokens": 9600396.0, "reward": -0.5426440834999084, "reward_std": 2.1479439735412598, "rewards/rollout_reward_func/mean": -0.5426440834999084, "rewards/rollout_reward_func/std": 2.759340524673462, "sampling/importance_sampling_ratio/max": 1.0655879974365234, "sampling/importance_sampling_ratio/mean": 0.4591163396835327, "sampling/importance_sampling_ratio/min": 5.552141727057855e-15, "sampling/sampling_logp_difference/max": 14.128543853759766, "sampling/sampling_logp_difference/mean": 0.41973888874053955, "step": 423, "step_time": 8.85772343899771 }, { "clip_ratio/high_max": 0.02336956560611725, "clip_ratio/high_mean": 0.011684782803058624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011684782803058624, "entropy": 2.034168303012848, "epoch": 0.00424, "grad_norm": 0.16303758323192596, "kl": 0.7094617765396833, "learning_rate": 9.999982585133294e-06, "loss": -0.0353, "step": 424, "step_time": 5.398789627000951 }, { "clip_ratio/high_max": 0.0243055559694767, "clip_ratio/high_mean": 0.01215277798473835, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "completions/clipped_ratio": 0.0625, "completions/max_length": 648.0, "completions/max_terminated_length": 648.0, "completions/mean_length": 412.96875, "completions/mean_terminated_length": 410.2333679199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.862203687429428, "epoch": 0.00425, "frac_reward_zero_std": 0.0, "grad_norm": 0.7416437268257141, "kl": 4.802857622504234, "learning_rate": 9.999982495250332e-06, "loss": -0.0303, "num_tokens": 9652535.0, "reward": 1.8972623348236084, "reward_std": 3.0181941986083984, "rewards/rollout_reward_func/mean": 1.8972623348236084, "rewards/rollout_reward_func/std": 3.17962646484375, "sampling/importance_sampling_ratio/max": 0.966421365737915, "sampling/importance_sampling_ratio/mean": 0.5033276081085205, "sampling/importance_sampling_ratio/min": 1.856175692165559e-16, "sampling/sampling_logp_difference/max": 15.309233665466309, "sampling/sampling_logp_difference/mean": 0.41693565249443054, "step": 425, "step_time": 8.668093508997117 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0193452388048172, "entropy": 1.892343670129776, "epoch": 0.00426, "grad_norm": 0.28442105650901794, "kl": 3.0118986889719963, "learning_rate": 9.999982405136008e-06, "loss": -0.0338, "step": 426, "step_time": 4.589455107001413 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completions/clipped_ratio": 0.0, "completions/max_length": 694.0, "completions/max_terminated_length": 694.0, "completions/mean_length": 314.96875, "completions/mean_terminated_length": 314.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.095763772726059, "epoch": 0.00427, "frac_reward_zero_std": 0.25, "grad_norm": 0.25952914357185364, "kl": 0.9114175792783499, "learning_rate": 9.999982314790328e-06, "loss": -0.0586, "num_tokens": 9699117.0, "reward": 1.7865715026855469, "reward_std": 2.030963182449341, "rewards/rollout_reward_func/mean": 1.7865715026855469, "rewards/rollout_reward_func/std": 3.3912978172302246, "sampling/importance_sampling_ratio/max": 1.1375629901885986, "sampling/importance_sampling_ratio/mean": 0.721232533454895, "sampling/importance_sampling_ratio/min": 0.06809907406568527, "sampling/sampling_logp_difference/max": 2.3685030937194824, "sampling/sampling_logp_difference/mean": 0.12230748683214188, "step": 427, "step_time": 8.553273028997864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.1287743672728539, "epoch": 0.00428, "grad_norm": 0.2513168454170227, "kl": 0.9420179277658463, "learning_rate": 9.999982224213289e-06, "loss": -0.0584, "step": 428, "step_time": 4.611839141001838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "completions/clipped_ratio": 0.0, "completions/max_length": 588.0, "completions/max_terminated_length": 588.0, "completions/mean_length": 332.3125, "completions/mean_terminated_length": 332.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9637897908687592, "epoch": 0.00429, "frac_reward_zero_std": 0.0, "grad_norm": 0.24687586724758148, "kl": 0.7746304254978895, "learning_rate": 9.999982133404892e-06, "loss": -0.059, "num_tokens": 9747421.0, "reward": 1.377939224243164, "reward_std": 3.312431812286377, "rewards/rollout_reward_func/mean": 1.377939224243164, "rewards/rollout_reward_func/std": 3.629523754119873, "sampling/importance_sampling_ratio/max": 0.9572874307632446, "sampling/importance_sampling_ratio/mean": 0.4669027626514435, "sampling/importance_sampling_ratio/min": 1.2004515999688211e-14, "sampling/sampling_logp_difference/max": 13.204042434692383, "sampling/sampling_logp_difference/mean": 0.41645699739456177, "step": 429, "step_time": 9.418079520999527 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 2.0158229172229767, "epoch": 0.0043, "grad_norm": 0.21013127267360687, "kl": 0.7820669487118721, "learning_rate": 9.999982042365139e-06, "loss": -0.0594, "step": 430, "step_time": 4.399563027998738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 308.28125, "completions/mean_terminated_length": 308.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.230863481760025, "epoch": 0.00431, "frac_reward_zero_std": 0.25, "grad_norm": 0.13260430097579956, "kl": 1.1894196718931198, "learning_rate": 9.999981951094025e-06, "loss": -0.0225, "num_tokens": 9792563.0, "reward": 2.499048948287964, "reward_std": 1.9449670314788818, "rewards/rollout_reward_func/mean": 2.499048948287964, "rewards/rollout_reward_func/std": 3.0858688354492188, "sampling/importance_sampling_ratio/max": 0.9602502584457397, "sampling/importance_sampling_ratio/mean": 0.6931223273277283, "sampling/importance_sampling_ratio/min": 0.01912606693804264, "sampling/sampling_logp_difference/max": 2.069547176361084, "sampling/sampling_logp_difference/mean": 0.13645033538341522, "step": 431, "step_time": 8.260927689001619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2644130289554596, "epoch": 0.00432, "grad_norm": 0.11745230108499527, "kl": 1.2108533699065447, "learning_rate": 9.999981859591555e-06, "loss": -0.0227, "step": 432, "step_time": 4.50023657499878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 603.0, "completions/max_terminated_length": 603.0, "completions/mean_length": 217.03125, "completions/mean_terminated_length": 217.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2180980667471886, "epoch": 0.00433, "frac_reward_zero_std": 0.25, "grad_norm": 0.29314935207366943, "kl": 0.9031511582434177, "learning_rate": 9.999981767857722e-06, "loss": -0.0326, "num_tokens": 9835310.0, "reward": 3.197054862976074, "reward_std": 1.4751875400543213, "rewards/rollout_reward_func/mean": 3.197054862976074, "rewards/rollout_reward_func/std": 2.4947116374969482, "sampling/importance_sampling_ratio/max": 0.9533083438873291, "sampling/importance_sampling_ratio/mean": 0.6836262941360474, "sampling/importance_sampling_ratio/min": 0.006317696068435907, "sampling/sampling_logp_difference/max": 2.7293097972869873, "sampling/sampling_logp_difference/mean": 0.15831468999385834, "step": 433, "step_time": 9.125707004001015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "entropy": 1.2477739080786705, "epoch": 0.00434, "grad_norm": 0.2838461697101593, "kl": 0.8996962737292051, "learning_rate": 9.999981675892536e-06, "loss": -0.0332, "step": 434, "step_time": 4.4500645240004815 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "completions/clipped_ratio": 0.03125, "completions/max_length": 652.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 218.625, "completions/mean_terminated_length": 211.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3722423762083054, "epoch": 0.00435, "frac_reward_zero_std": 0.25, "grad_norm": 0.3135797083377838, "kl": 1.4917678404599428, "learning_rate": 9.99998158369599e-06, "loss": -0.0409, "num_tokens": 9877799.0, "reward": 1.1246362924575806, "reward_std": 2.1648411750793457, "rewards/rollout_reward_func/mean": 1.1246362924575806, "rewards/rollout_reward_func/std": 3.1434876918792725, "sampling/importance_sampling_ratio/max": 0.9632418155670166, "sampling/importance_sampling_ratio/mean": 0.4901379942893982, "sampling/importance_sampling_ratio/min": 3.1687058033892657e-13, "sampling/sampling_logp_difference/max": 18.066349029541016, "sampling/sampling_logp_difference/mean": 0.5538672208786011, "step": 435, "step_time": 8.536381535001055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 2.4079708456993103, "epoch": 0.00436, "grad_norm": 0.3091908097267151, "kl": 1.4097816534340382, "learning_rate": 9.999981491268085e-06, "loss": -0.0415, "step": 436, "step_time": 4.542028665995531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 145.4375, "completions/mean_terminated_length": 145.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7244005128741264, "epoch": 0.00437, "frac_reward_zero_std": 0.25, "grad_norm": 0.2428680956363678, "kl": 0.76961431466043, "learning_rate": 9.999981398608821e-06, "loss": -0.0321, "num_tokens": 9916293.0, "reward": 2.469675302505493, "reward_std": 1.7765514850616455, "rewards/rollout_reward_func/mean": 2.469675302505493, "rewards/rollout_reward_func/std": 2.753077983856201, "sampling/importance_sampling_ratio/max": 0.9568741321563721, "sampling/importance_sampling_ratio/mean": 0.6240268349647522, "sampling/importance_sampling_ratio/min": 0.001722671207971871, "sampling/sampling_logp_difference/max": 3.4282279014587402, "sampling/sampling_logp_difference/mean": 0.2692504823207855, "step": 437, "step_time": 9.292745076003484 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.02784455195069313, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03565705195069313, "entropy": 1.7196654677391052, "epoch": 0.00438, "grad_norm": 0.22189737856388092, "kl": 0.798678757622838, "learning_rate": 9.999981305718201e-06, "loss": -0.0324, "step": 438, "step_time": 4.621336174999669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 214.34375, "completions/mean_terminated_length": 211.73333740234375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.650777667760849, "epoch": 0.00439, "frac_reward_zero_std": 0.25, "grad_norm": 0.8125907182693481, "kl": 1.2808409007266164, "learning_rate": 9.999981212596224e-06, "loss": -0.0486, "num_tokens": 9958041.0, "reward": 2.8660218715667725, "reward_std": 1.89542818069458, "rewards/rollout_reward_func/mean": 2.8660218715667725, "rewards/rollout_reward_func/std": 2.332723617553711, "sampling/importance_sampling_ratio/max": 2.4112792015075684, "sampling/importance_sampling_ratio/mean": 0.7309424877166748, "sampling/importance_sampling_ratio/min": 2.1311954867542227e-08, "sampling/sampling_logp_difference/max": 3.7871508598327637, "sampling/sampling_logp_difference/mean": 0.28786996006965637, "step": 439, "step_time": 8.273476718999518 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0059523810632526875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018452381249517202, "entropy": 1.7031737491488457, "epoch": 0.0044, "grad_norm": 0.2205708771944046, "kl": 1.281104183755815, "learning_rate": 9.999981119242886e-06, "loss": -0.0512, "step": 440, "step_time": 4.502173271002903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 303.25, "completions/mean_terminated_length": 292.0322570800781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.503381609916687, "epoch": 0.00441, "frac_reward_zero_std": 0.0, "grad_norm": 0.30895790457725525, "kl": 1.2475199103355408, "learning_rate": 9.99998102565819e-06, "loss": -0.0433, "num_tokens": 10006722.0, "reward": -0.0675249770283699, "reward_std": 2.7172555923461914, "rewards/rollout_reward_func/mean": -0.0675249770283699, "rewards/rollout_reward_func/std": 2.9873218536376953, "sampling/importance_sampling_ratio/max": 1.1568914651870728, "sampling/importance_sampling_ratio/mean": 0.4011974036693573, "sampling/importance_sampling_ratio/min": 0.014756831340491772, "sampling/sampling_logp_difference/max": 2.1311728954315186, "sampling/sampling_logp_difference/mean": 0.295357882976532, "step": 441, "step_time": 9.152525895999133 }, { "clip_ratio/high_max": 0.018181818537414074, "clip_ratio/high_mean": 0.009090909268707037, "clip_ratio/low_mean": 0.019444444682449102, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02853535395115614, "entropy": 2.4962492883205414, "epoch": 0.00442, "grad_norm": 0.2991968095302582, "kl": 1.2347796633839607, "learning_rate": 9.999980931842137e-06, "loss": -0.0438, "step": 442, "step_time": 5.005277482998281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009539473801851273, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009539473801851273, "completions/clipped_ratio": 0.03125, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 348.25, "completions/mean_terminated_length": 358.9677429199219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.3206842988729477, "epoch": 0.00443, "frac_reward_zero_std": 0.0, "grad_norm": 0.2385902851819992, "kl": 1.152601595968008, "learning_rate": 9.999980837794724e-06, "loss": -0.0556, "num_tokens": 10054213.0, "reward": 1.0908904075622559, "reward_std": 2.7579092979431152, "rewards/rollout_reward_func/mean": 1.0908904075622559, "rewards/rollout_reward_func/std": 3.0582191944122314, "sampling/importance_sampling_ratio/max": 0.9593425989151001, "sampling/importance_sampling_ratio/mean": 0.4808034598827362, "sampling/importance_sampling_ratio/min": 0.005296511109918356, "sampling/sampling_logp_difference/max": 3.0801711082458496, "sampling/sampling_logp_difference/mean": 0.3438183665275574, "step": 443, "step_time": 8.740731391000736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 2.3196137696504593, "epoch": 0.00444, "grad_norm": 0.24598205089569092, "kl": 1.1384193114936352, "learning_rate": 9.999980743515956e-06, "loss": -0.0561, "step": 444, "step_time": 4.728730539996832 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009661835851147771, "completions/clipped_ratio": 0.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 426.8125, "completions/mean_terminated_length": 426.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.6798468828201294, "epoch": 0.00445, "frac_reward_zero_std": 0.0, "grad_norm": 0.37412333488464355, "kl": 2.038691610097885, "learning_rate": 9.999980649005829e-06, "loss": -0.0257, "num_tokens": 10106875.0, "reward": 1.5306071043014526, "reward_std": 2.342071056365967, "rewards/rollout_reward_func/mean": 1.5306071043014526, "rewards/rollout_reward_func/std": 2.9454681873321533, "sampling/importance_sampling_ratio/max": 0.8023101687431335, "sampling/importance_sampling_ratio/mean": 0.29678037762641907, "sampling/importance_sampling_ratio/min": 1.499636595219644e-17, "sampling/sampling_logp_difference/max": 16.972978591918945, "sampling/sampling_logp_difference/mean": 0.5600482225418091, "step": 445, "step_time": 9.457899643999554 }, { "clip_ratio/high_max": 0.0701388893648982, "clip_ratio/high_mean": 0.0350694446824491, "clip_ratio/low_mean": 0.012626262847334146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04769570706412196, "entropy": 2.6548232436180115, "epoch": 0.00446, "grad_norm": 0.26734021306037903, "kl": 1.9918025806546211, "learning_rate": 9.999980554264342e-06, "loss": -0.0269, "step": 446, "step_time": 5.21925482900042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 712.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 373.53125, "completions/mean_terminated_length": 373.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7885100096464157, "epoch": 0.00447, "frac_reward_zero_std": 0.25, "grad_norm": 0.21695572137832642, "kl": 0.9369567893445492, "learning_rate": 9.999980459291499e-06, "loss": -0.029, "num_tokens": 10154681.0, "reward": 2.5365965366363525, "reward_std": 2.1450350284576416, "rewards/rollout_reward_func/mean": 2.5365965366363525, "rewards/rollout_reward_func/std": 2.7063097953796387, "sampling/importance_sampling_ratio/max": 1.0434380769729614, "sampling/importance_sampling_ratio/mean": 0.5669941902160645, "sampling/importance_sampling_ratio/min": 0.056022219359874725, "sampling/sampling_logp_difference/max": 1.6801260709762573, "sampling/sampling_logp_difference/mean": 0.21194574236869812, "step": 447, "step_time": 8.904687720996662 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02569444477558136, "entropy": 1.750549539923668, "epoch": 0.00448, "grad_norm": 0.19973124563694, "kl": 0.9510974623262882, "learning_rate": 9.999980364087295e-06, "loss": -0.0297, "step": 448, "step_time": 4.744810832004077 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013134058564901352, "completions/clipped_ratio": 0.03125, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 281.0, "completions/mean_terminated_length": 272.93548583984375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.006989151239395, "epoch": 0.00449, "frac_reward_zero_std": 0.25, "grad_norm": 0.17066644132137299, "kl": 0.9097038377076387, "learning_rate": 9.999980268651735e-06, "loss": -0.0233, "num_tokens": 10199946.0, "reward": 1.7338037490844727, "reward_std": 1.0723884105682373, "rewards/rollout_reward_func/mean": 1.7338037490844727, "rewards/rollout_reward_func/std": 3.0673022270202637, "sampling/importance_sampling_ratio/max": 0.9773352146148682, "sampling/importance_sampling_ratio/mean": 0.5098289251327515, "sampling/importance_sampling_ratio/min": 0.0014190145302563906, "sampling/sampling_logp_difference/max": 2.767803192138672, "sampling/sampling_logp_difference/mean": 0.3172318935394287, "step": 449, "step_time": 8.715510251002343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.9769493266940117, "epoch": 0.0045, "grad_norm": 0.19112570583820343, "kl": 0.8778569027781487, "learning_rate": 9.999980172984815e-06, "loss": -0.0235, "step": 450, "step_time": 5.5493493030026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.03125, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 240.78125, "completions/mean_terminated_length": 236.09677124023438, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4385041892528534, "epoch": 0.00451, "frac_reward_zero_std": 0.25, "grad_norm": 0.16835950314998627, "kl": 1.2053363136947155, "learning_rate": 9.99998007708654e-06, "loss": -0.0566, "num_tokens": 10243467.0, "reward": 1.9512805938720703, "reward_std": 2.2319042682647705, "rewards/rollout_reward_func/mean": 1.9512805938720703, "rewards/rollout_reward_func/std": 3.2095282077789307, "sampling/importance_sampling_ratio/max": 1.0473575592041016, "sampling/importance_sampling_ratio/mean": 0.5995588302612305, "sampling/importance_sampling_ratio/min": 0.008927933871746063, "sampling/sampling_logp_difference/max": 2.1565139293670654, "sampling/sampling_logp_difference/mean": 0.19074752926826477, "step": 451, "step_time": 7.9730885720000515 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009548611240461469, "entropy": 1.438181184232235, "epoch": 0.00452, "grad_norm": 0.162053644657135, "kl": 1.1921640038490295, "learning_rate": 9.999979980956905e-06, "loss": -0.0571, "step": 452, "step_time": 4.45361839299585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 710.0, "completions/max_terminated_length": 710.0, "completions/mean_length": 393.875, "completions/mean_terminated_length": 393.875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.6051545292139053, "epoch": 0.00453, "frac_reward_zero_std": 0.0, "grad_norm": 0.3173859715461731, "kl": 1.3091577962040901, "learning_rate": 9.999979884595913e-06, "loss": -0.0324, "num_tokens": 10294793.0, "reward": 2.490157127380371, "reward_std": 1.9971914291381836, "rewards/rollout_reward_func/mean": 2.490157127380371, "rewards/rollout_reward_func/std": 3.114956855773926, "sampling/importance_sampling_ratio/max": 0.9091593027114868, "sampling/importance_sampling_ratio/mean": 0.5938606858253479, "sampling/importance_sampling_ratio/min": 5.344514258169802e-16, "sampling/sampling_logp_difference/max": 15.07096004486084, "sampling/sampling_logp_difference/mean": 0.35658276081085205, "step": 453, "step_time": 8.869870031001483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 1.5936676561832428, "epoch": 0.00454, "grad_norm": 0.32279977202415466, "kl": 1.3491567857563496, "learning_rate": 9.999979788003562e-06, "loss": -0.0332, "step": 454, "step_time": 5.208440092997989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016964286100119352, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.016964286100119352, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 409.53125, "completions/mean_terminated_length": 409.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8234039396047592, "epoch": 0.00455, "frac_reward_zero_std": 0.0, "grad_norm": 0.3848475217819214, "kl": 1.5535787045955658, "learning_rate": 9.999979691179852e-06, "loss": -0.0614, "num_tokens": 10347055.0, "reward": 1.6665663719177246, "reward_std": 3.034369707107544, "rewards/rollout_reward_func/mean": 1.6665663719177246, "rewards/rollout_reward_func/std": 3.2258474826812744, "sampling/importance_sampling_ratio/max": 0.9148305058479309, "sampling/importance_sampling_ratio/mean": 0.48311400413513184, "sampling/importance_sampling_ratio/min": 0.003395181382074952, "sampling/sampling_logp_difference/max": 2.601893663406372, "sampling/sampling_logp_difference/mean": 0.2527132034301758, "step": 455, "step_time": 9.152828218997456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.033238636795431376, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.033238636795431376, "entropy": 1.8242628872394562, "epoch": 0.00456, "grad_norm": 0.32521864771842957, "kl": 1.5843795239925385, "learning_rate": 9.999979594124784e-06, "loss": -0.0628, "step": 456, "step_time": 4.660011811000004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 55.46875, "completions/mean_terminated_length": 55.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.683131068944931, "epoch": 0.00457, "frac_reward_zero_std": 0.75, "grad_norm": 0.14451630413532257, "kl": 0.22014949889853597, "learning_rate": 9.99997949683836e-06, "loss": -0.0109, "num_tokens": 10380774.0, "reward": 2.8686673641204834, "reward_std": 0.8822793960571289, "rewards/rollout_reward_func/mean": 2.8686673641204834, "rewards/rollout_reward_func/std": 2.4115965366363525, "sampling/importance_sampling_ratio/max": 0.9706569314002991, "sampling/importance_sampling_ratio/mean": 0.8560599684715271, "sampling/importance_sampling_ratio/min": 0.02457243949174881, "sampling/sampling_logp_difference/max": 1.898886799812317, "sampling/sampling_logp_difference/mean": 0.07814322412014008, "step": 457, "step_time": 7.4665830499961885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6776793785393238, "epoch": 0.00458, "grad_norm": 0.14652101695537567, "kl": 0.21355413179844618, "learning_rate": 9.999979399320576e-06, "loss": -0.0109, "step": 458, "step_time": 4.655784917998972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 396.40625, "completions/mean_terminated_length": 396.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2192815989255905, "epoch": 0.00459, "frac_reward_zero_std": 0.0, "grad_norm": 0.3488735854625702, "kl": 1.497399851679802, "learning_rate": 9.999979301571434e-06, "loss": -0.0606, "num_tokens": 10432430.0, "reward": 2.358025074005127, "reward_std": 2.7751078605651855, "rewards/rollout_reward_func/mean": 2.358025074005127, "rewards/rollout_reward_func/std": 3.2443161010742188, "sampling/importance_sampling_ratio/max": 1.095847249031067, "sampling/importance_sampling_ratio/mean": 0.6864190101623535, "sampling/importance_sampling_ratio/min": 0.03227991610765457, "sampling/sampling_logp_difference/max": 2.703238010406494, "sampling/sampling_logp_difference/mean": 0.14312446117401123, "step": 459, "step_time": 8.98219331600194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.03370535792782903, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.03370535792782903, "entropy": 1.2217194139957428, "epoch": 0.0046, "grad_norm": 0.23510883748531342, "kl": 1.5532296448946, "learning_rate": 9.999979203590935e-06, "loss": -0.0619, "step": 460, "step_time": 4.5179099469987705 }, { "clip_ratio/high_max": 0.02223320212215185, "clip_ratio/high_mean": 0.011116601061075926, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02361660124734044, "completions/clipped_ratio": 0.03125, "completions/max_length": 721.0, "completions/max_terminated_length": 721.0, "completions/mean_length": 453.34375, "completions/mean_terminated_length": 445.83868408203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.22023144364357, "epoch": 0.00461, "frac_reward_zero_std": 0.0, "grad_norm": 0.3470205068588257, "kl": 1.8156204782426357, "learning_rate": 9.999979105379078e-06, "loss": -0.0312, "num_tokens": 10487277.0, "reward": 0.9151858687400818, "reward_std": 1.7213425636291504, "rewards/rollout_reward_func/mean": 0.9151858687400818, "rewards/rollout_reward_func/std": 3.0145692825317383, "sampling/importance_sampling_ratio/max": 0.9585036635398865, "sampling/importance_sampling_ratio/mean": 0.43632376194000244, "sampling/importance_sampling_ratio/min": 3.876791652146494e-06, "sampling/sampling_logp_difference/max": 3.93989896774292, "sampling/sampling_logp_difference/mean": 0.3569856584072113, "step": 461, "step_time": 8.801781861000563 }, { "clip_ratio/high_max": 0.037258454598486423, "clip_ratio/high_mean": 0.018629227299243212, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029045894276350737, "entropy": 2.2180516496300697, "epoch": 0.00462, "grad_norm": 0.3287392258644104, "kl": 1.7549058683216572, "learning_rate": 9.999979006935863e-06, "loss": -0.0323, "step": 462, "step_time": 5.334200328001316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 717.0, "completions/max_terminated_length": 717.0, "completions/mean_length": 409.65625, "completions/mean_terminated_length": 409.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.327076032757759, "epoch": 0.00463, "frac_reward_zero_std": 0.0, "grad_norm": 0.42826321721076965, "kl": 0.9361725831404328, "learning_rate": 9.999978908261289e-06, "loss": -0.04, "num_tokens": 10539656.0, "reward": 1.1606733798980713, "reward_std": 3.3136329650878906, "rewards/rollout_reward_func/mean": 1.1606733798980713, "rewards/rollout_reward_func/std": 3.194823741912842, "sampling/importance_sampling_ratio/max": 1.0178433656692505, "sampling/importance_sampling_ratio/mean": 0.6010065078735352, "sampling/importance_sampling_ratio/min": 8.637860446469858e-05, "sampling/sampling_logp_difference/max": 4.094727993011475, "sampling/sampling_logp_difference/mean": 0.19611576199531555, "step": 463, "step_time": 9.415577952999229 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.026636905502527952, "clip_ratio/low_min": 0.010416666977107525, "clip_ratio/region_mean": 0.03556547733023763, "entropy": 1.30862557888031, "epoch": 0.00464, "grad_norm": 0.31587356328964233, "kl": 0.9356804005801678, "learning_rate": 9.999978809355357e-06, "loss": -0.0428, "step": 464, "step_time": 4.746301327000765 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020138889085501432, "completions/clipped_ratio": 0.0, "completions/max_length": 333.0, "completions/max_terminated_length": 333.0, "completions/mean_length": 59.75, "completions/mean_terminated_length": 59.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6264088153839111, "epoch": 0.00465, "frac_reward_zero_std": 0.5, "grad_norm": 0.27586352825164795, "kl": 0.809514569118619, "learning_rate": 9.999978710218067e-06, "loss": -0.0136, "num_tokens": 10575468.0, "reward": 2.057710647583008, "reward_std": 1.285008192062378, "rewards/rollout_reward_func/mean": 2.057710647583008, "rewards/rollout_reward_func/std": 3.05918025970459, "sampling/importance_sampling_ratio/max": 0.9614396095275879, "sampling/importance_sampling_ratio/mean": 0.6385653614997864, "sampling/importance_sampling_ratio/min": 0.0035127075389027596, "sampling/sampling_logp_difference/max": 2.31451678276062, "sampling/sampling_logp_difference/mean": 0.24930748343467712, "step": 465, "step_time": 6.741009435005253 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02569444477558136, "entropy": 1.6404257714748383, "epoch": 0.00466, "grad_norm": 0.21419338881969452, "kl": 0.8568767490796745, "learning_rate": 9.999978610849418e-06, "loss": -0.0136, "step": 466, "step_time": 3.6457254530032515 }, { "clip_ratio/high_max": 0.02003205195069313, "clip_ratio/high_mean": 0.010016025975346565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "completions/clipped_ratio": 0.03125, "completions/max_length": 692.0, "completions/max_terminated_length": 692.0, "completions/mean_length": 242.0, "completions/mean_terminated_length": 234.8064422607422, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2938694804906845, "epoch": 0.00467, "frac_reward_zero_std": 0.25, "grad_norm": 4.012087345123291, "kl": 0.7110883798450232, "learning_rate": 9.999978511249413e-06, "loss": 0.0218, "num_tokens": 10619986.0, "reward": 1.3758615255355835, "reward_std": 1.552269458770752, "rewards/rollout_reward_func/mean": 1.3758615255355835, "rewards/rollout_reward_func/std": 2.947014808654785, "sampling/importance_sampling_ratio/max": 1.2114049196243286, "sampling/importance_sampling_ratio/mean": 0.6550114750862122, "sampling/importance_sampling_ratio/min": 0.011979692615568638, "sampling/sampling_logp_difference/max": 2.298370838165283, "sampling/sampling_logp_difference/mean": 0.17173241078853607, "step": 467, "step_time": 9.558289531994888 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010890151839703321, "entropy": 1.2499201968312263, "epoch": 0.00468, "grad_norm": 0.6852825880050659, "kl": 0.6995876468718052, "learning_rate": 9.99997841141805e-06, "loss": 0.0159, "step": 468, "step_time": 4.646173514000111 }, { "clip_ratio/high_max": 0.02638888917863369, "clip_ratio/high_mean": 0.013194444589316845, "clip_ratio/low_mean": 0.009615384973585606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022809830028563738, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 472.875, "completions/mean_terminated_length": 472.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2745232433080673, "epoch": 0.00469, "frac_reward_zero_std": 0.0, "grad_norm": 0.34106168150901794, "kl": 1.3788381442427635, "learning_rate": 9.999978311355328e-06, "loss": -0.0482, "num_tokens": 10673193.0, "reward": 0.929858922958374, "reward_std": 2.578484058380127, "rewards/rollout_reward_func/mean": 0.929858922958374, "rewards/rollout_reward_func/std": 2.665842294692993, "sampling/importance_sampling_ratio/max": 1.2038469314575195, "sampling/importance_sampling_ratio/mean": 0.3825010061264038, "sampling/importance_sampling_ratio/min": 0.005097619257867336, "sampling/sampling_logp_difference/max": 2.7458419799804688, "sampling/sampling_logp_difference/mean": 0.35878872871398926, "step": 469, "step_time": 8.898019583002679 }, { "clip_ratio/high_max": 0.038888889364898205, "clip_ratio/high_mean": 0.019444444682449102, "clip_ratio/low_mean": 0.022569444496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04201388917863369, "entropy": 2.2551735788583755, "epoch": 0.0047, "grad_norm": 0.3345230519771576, "kl": 1.3723508268594742, "learning_rate": 9.999978211061248e-06, "loss": -0.0497, "step": 470, "step_time": 4.809431419000248 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "completions/clipped_ratio": 0.0, "completions/max_length": 719.0, "completions/max_terminated_length": 719.0, "completions/mean_length": 555.6875, "completions/mean_terminated_length": 555.6875, "completions/min_length": 372.0, "completions/min_terminated_length": 372.0, "entropy": 2.179171770811081, "epoch": 0.00471, "frac_reward_zero_std": 0.0, "grad_norm": 0.2896701693534851, "kl": 2.4456317722797394, "learning_rate": 9.999978110535811e-06, "loss": -0.0054, "num_tokens": 10730493.0, "reward": 0.9083907604217529, "reward_std": 2.3964195251464844, "rewards/rollout_reward_func/mean": 0.9083907604217529, "rewards/rollout_reward_func/std": 2.647118330001831, "sampling/importance_sampling_ratio/max": 0.997483491897583, "sampling/importance_sampling_ratio/mean": 0.40206605195999146, "sampling/importance_sampling_ratio/min": 0.006989242043346167, "sampling/sampling_logp_difference/max": 2.6462461948394775, "sampling/sampling_logp_difference/mean": 0.33155548572540283, "step": 471, "step_time": 9.546268578997115 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02582070743665099, "entropy": 2.1550545394420624, "epoch": 0.00472, "grad_norm": 0.22586014866828918, "kl": 2.348675847053528, "learning_rate": 9.999978009779015e-06, "loss": -0.006, "step": 472, "step_time": 5.270979466000426 }, { "clip_ratio/high_max": 0.029513888992369175, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029513888992369175, "completions/clipped_ratio": 0.0, "completions/max_length": 703.0, "completions/max_terminated_length": 703.0, "completions/mean_length": 273.09375, "completions/mean_terminated_length": 273.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3139574229717255, "epoch": 0.00473, "frac_reward_zero_std": 0.25, "grad_norm": 0.4033682942390442, "kl": 1.1267263498157263, "learning_rate": 9.999977908790861e-06, "loss": -0.0299, "num_tokens": 10773805.0, "reward": 1.243711233139038, "reward_std": 1.5522351264953613, "rewards/rollout_reward_func/mean": 1.243711233139038, "rewards/rollout_reward_func/std": 3.3351027965545654, "sampling/importance_sampling_ratio/max": 0.9721920490264893, "sampling/importance_sampling_ratio/mean": 0.6593877673149109, "sampling/importance_sampling_ratio/min": 0.010208606719970703, "sampling/sampling_logp_difference/max": 2.2486233711242676, "sampling/sampling_logp_difference/mean": 0.18811219930648804, "step": 473, "step_time": 8.416929508002795 }, { "clip_ratio/high_max": 0.07638888899236917, "clip_ratio/high_mean": 0.03819444449618459, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04600694449618459, "entropy": 1.2746058478951454, "epoch": 0.00474, "grad_norm": 0.15448766946792603, "kl": 1.1069509405642748, "learning_rate": 9.999977807571349e-06, "loss": -0.0312, "step": 474, "step_time": 4.682508402000167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.022161836037412286, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022161836037412286, "completions/clipped_ratio": 0.03125, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 359.53125, "completions/mean_terminated_length": 351.58062744140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2702538222074509, "epoch": 0.00475, "frac_reward_zero_std": 0.25, "grad_norm": 0.3058786392211914, "kl": 0.909590495750308, "learning_rate": 9.99997770612048e-06, "loss": -0.0751, "num_tokens": 10821660.0, "reward": 2.284728765487671, "reward_std": 2.115673780441284, "rewards/rollout_reward_func/mean": 2.284728765487671, "rewards/rollout_reward_func/std": 2.6521737575531006, "sampling/importance_sampling_ratio/max": 1.0062024593353271, "sampling/importance_sampling_ratio/mean": 0.6440948247909546, "sampling/importance_sampling_ratio/min": 0.012728707864880562, "sampling/sampling_logp_difference/max": 2.02046275138855, "sampling/sampling_logp_difference/mean": 0.17390283942222595, "step": 475, "step_time": 9.540827348000676 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028645833488553762, "entropy": 1.2516313288360834, "epoch": 0.00476, "grad_norm": 0.20129148662090302, "kl": 0.8593135653063655, "learning_rate": 9.999977604438252e-06, "loss": -0.0757, "step": 476, "step_time": 5.196166400000948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 202.59375, "completions/mean_terminated_length": 202.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7262525036931038, "epoch": 0.00477, "frac_reward_zero_std": 0.5, "grad_norm": 0.6373583674430847, "kl": 0.9747399315237999, "learning_rate": 9.999977502524667e-06, "loss": -0.0098, "num_tokens": 10863070.0, "reward": 3.057525157928467, "reward_std": 1.2908124923706055, "rewards/rollout_reward_func/mean": 3.057525157928467, "rewards/rollout_reward_func/std": 2.4036824703216553, "sampling/importance_sampling_ratio/max": 0.9719278812408447, "sampling/importance_sampling_ratio/mean": 0.7830299139022827, "sampling/importance_sampling_ratio/min": 0.14849069714546204, "sampling/sampling_logp_difference/max": 1.7084441184997559, "sampling/sampling_logp_difference/mean": 0.09871283918619156, "step": 477, "step_time": 8.501113989999794 }, { "clip_ratio/high_max": 0.041666666977107525, "clip_ratio/high_mean": 0.020833333488553762, "clip_ratio/low_mean": 0.01785714365541935, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03869047714397311, "entropy": 0.7273271605372429, "epoch": 0.00478, "grad_norm": 0.21442443132400513, "kl": 0.8763797078281641, "learning_rate": 9.999977400379722e-06, "loss": -0.0113, "step": 478, "step_time": 4.811899804002678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 192.90625, "completions/mean_terminated_length": 192.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5547960195690393, "epoch": 0.00479, "frac_reward_zero_std": 0.25, "grad_norm": 0.24422191083431244, "kl": 0.5441262107342482, "learning_rate": 9.99997729800342e-06, "loss": -0.0323, "num_tokens": 10902979.0, "reward": 3.52651047706604, "reward_std": 1.4398868083953857, "rewards/rollout_reward_func/mean": 3.52651047706604, "rewards/rollout_reward_func/std": 2.1911070346832275, "sampling/importance_sampling_ratio/max": 1.107340931892395, "sampling/importance_sampling_ratio/mean": 0.8867872953414917, "sampling/importance_sampling_ratio/min": 0.00013286575267557055, "sampling/sampling_logp_difference/max": 2.891014575958252, "sampling/sampling_logp_difference/mean": 0.09460555016994476, "step": 479, "step_time": 7.92070909800168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.5602920949459076, "epoch": 0.0048, "grad_norm": 0.2523859441280365, "kl": 0.5459944494068623, "learning_rate": 9.999977195395762e-06, "loss": -0.0328, "step": 480, "step_time": 5.322957778998898 }, { "clip_ratio/high_max": 0.045138888992369175, "clip_ratio/high_mean": 0.022569444496184587, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028819444589316845, "completions/clipped_ratio": 0.0, "completions/max_length": 731.0, "completions/max_terminated_length": 731.0, "completions/mean_length": 436.875, "completions/mean_terminated_length": 436.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9728070721030235, "epoch": 0.00481, "frac_reward_zero_std": 0.0, "grad_norm": 0.3188297748565674, "kl": 1.2183297201991081, "learning_rate": 9.999977092556745e-06, "loss": 0.0045, "num_tokens": 10954760.0, "reward": 1.2553541660308838, "reward_std": 1.7536381483078003, "rewards/rollout_reward_func/mean": 1.2553541660308838, "rewards/rollout_reward_func/std": 2.784682512283325, "sampling/importance_sampling_ratio/max": 1.0207996368408203, "sampling/importance_sampling_ratio/mean": 0.5385324358940125, "sampling/importance_sampling_ratio/min": 0.010509870946407318, "sampling/sampling_logp_difference/max": 2.668506145477295, "sampling/sampling_logp_difference/mean": 0.28348276019096375, "step": 481, "step_time": 8.828887368001233 }, { "clip_ratio/high_max": 0.059375000186264515, "clip_ratio/high_mean": 0.035937500186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.035937500186264515, "entropy": 1.9384094178676605, "epoch": 0.00482, "grad_norm": 0.38537362217903137, "kl": 1.1744553595781326, "learning_rate": 9.999976989486369e-06, "loss": 0.0051, "step": 482, "step_time": 4.796921044000555 }, { "clip_ratio/high_max": 0.029513888992369175, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.03819444449618459, "completions/clipped_ratio": 0.0, "completions/max_length": 699.0, "completions/max_terminated_length": 699.0, "completions/mean_length": 257.78125, "completions/mean_terminated_length": 257.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.114930510520935, "epoch": 0.00483, "frac_reward_zero_std": 0.0, "grad_norm": 0.9760074019432068, "kl": 1.5707873962819576, "learning_rate": 9.999976886184636e-06, "loss": -0.0343, "num_tokens": 11000874.0, "reward": 0.021856069564819336, "reward_std": 2.5229339599609375, "rewards/rollout_reward_func/mean": 0.021856069564819336, "rewards/rollout_reward_func/std": 3.191693067550659, "sampling/importance_sampling_ratio/max": 0.9600971341133118, "sampling/importance_sampling_ratio/mean": 0.4563210606575012, "sampling/importance_sampling_ratio/min": 0.03069194406270981, "sampling/sampling_logp_difference/max": 3.1812286376953125, "sampling/sampling_logp_difference/mean": 0.29436004161834717, "step": 483, "step_time": 8.589960551998956 }, { "clip_ratio/high_max": 0.050347222946584225, "clip_ratio/high_mean": 0.025173611473292112, "clip_ratio/low_mean": 0.0234375, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04861111147329211, "entropy": 2.0949898809194565, "epoch": 0.00484, "grad_norm": 0.22844935953617096, "kl": 1.449398223310709, "learning_rate": 9.999976782651545e-06, "loss": -0.0354, "step": 484, "step_time": 5.612919659997715 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 388.03125, "completions/mean_terminated_length": 388.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6655789613723755, "epoch": 0.00485, "frac_reward_zero_std": 0.0, "grad_norm": 0.3723224401473999, "kl": 1.4900403693318367, "learning_rate": 9.999976678887095e-06, "loss": -0.0447, "num_tokens": 11052730.0, "reward": 0.021248959004878998, "reward_std": 2.65728497505188, "rewards/rollout_reward_func/mean": 0.021248959004878998, "rewards/rollout_reward_func/std": 3.0395803451538086, "sampling/importance_sampling_ratio/max": 1.0092761516571045, "sampling/importance_sampling_ratio/mean": 0.5492430329322815, "sampling/importance_sampling_ratio/min": 0.013601827435195446, "sampling/sampling_logp_difference/max": 2.489377021789551, "sampling/sampling_logp_difference/mean": 0.23223736882209778, "step": 485, "step_time": 8.856390252996789 }, { "clip_ratio/high_max": 0.038888889364898205, "clip_ratio/high_mean": 0.019444444682449102, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03194444486871362, "entropy": 1.66528682410717, "epoch": 0.00486, "grad_norm": 0.28071078658103943, "kl": 1.4677733331918716, "learning_rate": 9.999976574891289e-06, "loss": -0.0453, "step": 486, "step_time": 4.71275998299825 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.014165981439873576, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0245826484169811, "completions/clipped_ratio": 0.0, "completions/max_length": 725.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 271.53125, "completions/mean_terminated_length": 271.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2566679455339909, "epoch": 0.00487, "frac_reward_zero_std": 0.25, "grad_norm": 0.21945655345916748, "kl": 1.02526974119246, "learning_rate": 9.999976470664125e-06, "loss": -0.0304, "num_tokens": 11098464.0, "reward": 2.882798194885254, "reward_std": 1.6231449842453003, "rewards/rollout_reward_func/mean": 2.882798194885254, "rewards/rollout_reward_func/std": 2.4924867153167725, "sampling/importance_sampling_ratio/max": 0.9716309309005737, "sampling/importance_sampling_ratio/mean": 0.6798602342605591, "sampling/importance_sampling_ratio/min": 2.098559902384022e-18, "sampling/sampling_logp_difference/max": 22.165550231933594, "sampling/sampling_logp_difference/mean": 0.38694876432418823, "step": 487, "step_time": 8.691885268000988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.024092452134937048, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024092452134937048, "entropy": 1.2452382631599903, "epoch": 0.00488, "grad_norm": 0.2273077666759491, "kl": 0.9910112414509058, "learning_rate": 9.999976366205601e-06, "loss": -0.0306, "step": 488, "step_time": 5.152619621998383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018750000279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "completions/clipped_ratio": 0.03125, "completions/max_length": 668.0, "completions/max_terminated_length": 668.0, "completions/mean_length": 315.65625, "completions/mean_terminated_length": 304.6773986816406, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6954635605216026, "epoch": 0.00489, "frac_reward_zero_std": 0.0, "grad_norm": 0.29881051182746887, "kl": 1.362201064825058, "learning_rate": 9.99997626151572e-06, "loss": -0.0583, "num_tokens": 11145749.0, "reward": -0.3886372447013855, "reward_std": 1.7357012033462524, "rewards/rollout_reward_func/mean": -0.3886372447013855, "rewards/rollout_reward_func/std": 3.296334743499756, "sampling/importance_sampling_ratio/max": 0.9569827914237976, "sampling/importance_sampling_ratio/mean": 0.5055504441261292, "sampling/importance_sampling_ratio/min": 0.012866111472249031, "sampling/sampling_logp_difference/max": 2.7010960578918457, "sampling/sampling_logp_difference/mean": 0.2803792357444763, "step": 489, "step_time": 8.906157781999354 }, { "clip_ratio/high_max": 0.046875, "clip_ratio/high_mean": 0.0234375, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03750000009313226, "entropy": 1.6701686307787895, "epoch": 0.0049, "grad_norm": 0.13625726103782654, "kl": 1.3096492439508438, "learning_rate": 9.999976156594483e-06, "loss": -0.0588, "step": 490, "step_time": 4.557141623999996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 498.0, "completions/max_terminated_length": 498.0, "completions/mean_length": 116.4375, "completions/mean_terminated_length": 116.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.27915761433541775, "epoch": 0.00491, "frac_reward_zero_std": 0.5, "grad_norm": 0.7664339542388916, "kl": 0.4719970393925905, "learning_rate": 9.999976051441887e-06, "loss": -0.0076, "num_tokens": 11181704.0, "reward": 3.7617135047912598, "reward_std": 1.0268809795379639, "rewards/rollout_reward_func/mean": 3.7617135047912598, "rewards/rollout_reward_func/std": 1.4579967260360718, "sampling/importance_sampling_ratio/max": 0.9724729657173157, "sampling/importance_sampling_ratio/mean": 0.9250845313072205, "sampling/importance_sampling_ratio/min": 0.46573296189308167, "sampling/sampling_logp_difference/max": 0.49645519256591797, "sampling/sampling_logp_difference/mean": 0.023401256650686264, "step": 491, "step_time": 7.446358300996508 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 0.27294569835066795, "epoch": 0.00492, "grad_norm": 0.3267718553543091, "kl": 0.46822477877140045, "learning_rate": 9.999975946057932e-06, "loss": -0.0099, "step": 492, "step_time": 4.639618762996179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008097166195511818, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008097166195511818, "completions/clipped_ratio": 0.03125, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 206.78125, "completions/mean_terminated_length": 212.9354705810547, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7020797003060579, "epoch": 0.00493, "frac_reward_zero_std": 0.0, "grad_norm": 0.2421378493309021, "kl": 0.5838824631646276, "learning_rate": 9.99997584044262e-06, "loss": -0.0465, "num_tokens": 11222584.0, "reward": 2.246624708175659, "reward_std": 2.5198936462402344, "rewards/rollout_reward_func/mean": 2.246624708175659, "rewards/rollout_reward_func/std": 2.8655853271484375, "sampling/importance_sampling_ratio/max": 0.9727534651756287, "sampling/importance_sampling_ratio/mean": 0.8022865056991577, "sampling/importance_sampling_ratio/min": 0.0018754175398498774, "sampling/sampling_logp_difference/max": 2.1770668029785156, "sampling/sampling_logp_difference/mean": 0.1331184208393097, "step": 493, "step_time": 8.68816211000376 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.016666667070239782, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023245614487677813, "entropy": 0.6989128217101097, "epoch": 0.00494, "grad_norm": 0.17017315328121185, "kl": 0.5541959041729569, "learning_rate": 9.99997573459595e-06, "loss": -0.0469, "step": 494, "step_time": 4.481280395000795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.03125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 445.59375, "completions/mean_terminated_length": 444.6451416015625, "completions/min_length": 11.0, "completions/min_terminated_length": 11.0, "entropy": 1.0923077538609505, "epoch": 0.00495, "frac_reward_zero_std": 0.0, "grad_norm": 0.1390458047389984, "kl": 1.1695045493543148, "learning_rate": 9.999975628517921e-06, "loss": -0.0333, "num_tokens": 11275831.0, "reward": 1.520524501800537, "reward_std": 2.268050193786621, "rewards/rollout_reward_func/mean": 1.520524501800537, "rewards/rollout_reward_func/std": 2.947422981262207, "sampling/importance_sampling_ratio/max": 0.9573947787284851, "sampling/importance_sampling_ratio/mean": 0.6878199577331543, "sampling/importance_sampling_ratio/min": 3.677517683770877e-16, "sampling/sampling_logp_difference/max": 21.680278778076172, "sampling/sampling_logp_difference/mean": 0.2902003824710846, "step": 495, "step_time": 8.468377442999554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009588068351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009588068351149559, "entropy": 1.099364172667265, "epoch": 0.00496, "grad_norm": 0.12337230890989304, "kl": 1.1821853257715702, "learning_rate": 9.999975522208535e-06, "loss": -0.0334, "step": 496, "step_time": 4.576890328000445 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 435.46875, "completions/mean_terminated_length": 435.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3451991975307465, "epoch": 0.00497, "frac_reward_zero_std": 0.25, "grad_norm": 0.4814055562019348, "kl": 1.1729890666902065, "learning_rate": 9.999975415667793e-06, "loss": 0.0004, "num_tokens": 11326168.0, "reward": 2.087308406829834, "reward_std": 0.6296377182006836, "rewards/rollout_reward_func/mean": 2.087308406829834, "rewards/rollout_reward_func/std": 2.4021730422973633, "sampling/importance_sampling_ratio/max": 0.9689602851867676, "sampling/importance_sampling_ratio/mean": 0.5991185903549194, "sampling/importance_sampling_ratio/min": 0.0470615029335022, "sampling/sampling_logp_difference/max": 2.449958562850952, "sampling/sampling_logp_difference/mean": 0.19027084112167358, "step": 497, "step_time": 9.498257230998206 }, { "clip_ratio/high_max": 0.038888889364898205, "clip_ratio/high_mean": 0.026388889644294977, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03531746193766594, "entropy": 1.3711797073483467, "epoch": 0.00498, "grad_norm": 0.33413493633270264, "kl": 1.1582091003656387, "learning_rate": 9.999975308895691e-06, "loss": -0.0007, "step": 498, "step_time": 4.803519434997725 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0193452388048172, "completions/clipped_ratio": 0.0, "completions/max_length": 468.0, "completions/max_terminated_length": 468.0, "completions/mean_length": 40.78125, "completions/mean_terminated_length": 40.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6383540071547031, "epoch": 0.00499, "frac_reward_zero_std": 0.75, "grad_norm": 0.1325276643037796, "kl": 0.2036738325841725, "learning_rate": 9.999975201892232e-06, "loss": -0.0247, "num_tokens": 11357753.0, "reward": 2.862497091293335, "reward_std": 0.8343582153320312, "rewards/rollout_reward_func/mean": 2.862497091293335, "rewards/rollout_reward_func/std": 2.5487191677093506, "sampling/importance_sampling_ratio/max": 0.9728832840919495, "sampling/importance_sampling_ratio/mean": 0.8403700590133667, "sampling/importance_sampling_ratio/min": 1.7088845766885467e-19, "sampling/sampling_logp_difference/max": 17.151893615722656, "sampling/sampling_logp_difference/mean": 0.3522942364215851, "step": 499, "step_time": 7.309508360998734 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0193452388048172, "entropy": 0.6308235935866833, "epoch": 0.005, "grad_norm": 0.11883936822414398, "kl": 0.19966594967991114, "learning_rate": 9.999975094657417e-06, "loss": -0.0247, "step": 500, "step_time": 4.051650139999765 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 638.0, "completions/max_terminated_length": 638.0, "completions/mean_length": 372.21875, "completions/mean_terminated_length": 372.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.067616831511259, "epoch": 0.00501, "frac_reward_zero_std": 0.25, "grad_norm": 0.37452641129493713, "kl": 1.2253615036606789, "learning_rate": 9.999974987191241e-06, "loss": -0.018, "num_tokens": 11405886.0, "reward": 1.7190873622894287, "reward_std": 2.0700836181640625, "rewards/rollout_reward_func/mean": 1.7190873622894287, "rewards/rollout_reward_func/std": 2.986659049987793, "sampling/importance_sampling_ratio/max": 0.9735283255577087, "sampling/importance_sampling_ratio/mean": 0.6490414142608643, "sampling/importance_sampling_ratio/min": 0.01141214370727539, "sampling/sampling_logp_difference/max": 2.184123992919922, "sampling/sampling_logp_difference/mean": 0.1729019582271576, "step": 501, "step_time": 9.618100226003662 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 1.0949161984026432, "epoch": 0.00502, "grad_norm": 0.2828841805458069, "kl": 1.2578935250639915, "learning_rate": 9.99997487949371e-06, "loss": -0.0184, "step": 502, "step_time": 4.5641489590016135 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completions/clipped_ratio": 0.03125, "completions/max_length": 643.0, "completions/max_terminated_length": 643.0, "completions/mean_length": 184.65625, "completions/mean_terminated_length": 173.32257080078125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5697703212499619, "epoch": 0.00503, "frac_reward_zero_std": 0.0, "grad_norm": 0.3590024411678314, "kl": 0.7138505624607205, "learning_rate": 9.999974771564817e-06, "loss": -0.0278, "num_tokens": 11447561.0, "reward": -0.35367125272750854, "reward_std": 2.4791197776794434, "rewards/rollout_reward_func/mean": -0.35367125272750854, "rewards/rollout_reward_func/std": 3.248717784881592, "sampling/importance_sampling_ratio/max": 0.9698725342750549, "sampling/importance_sampling_ratio/mean": 0.5839766263961792, "sampling/importance_sampling_ratio/min": 2.336911997247171e-09, "sampling/sampling_logp_difference/max": 3.8701331615448, "sampling/sampling_logp_difference/mean": 0.30074357986450195, "step": 503, "step_time": 8.340087071996095 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.02864583395421505, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03125000069849193, "entropy": 1.6351257413625717, "epoch": 0.00504, "grad_norm": 0.5504533648490906, "kl": 0.7369287684559822, "learning_rate": 9.999974663404571e-06, "loss": -0.0284, "step": 504, "step_time": 4.52990339799544 }, { "clip_ratio/high_max": 0.02211538515985012, "clip_ratio/high_mean": 0.01105769257992506, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01105769257992506, "completions/clipped_ratio": 0.03125, "completions/max_length": 542.0, "completions/max_terminated_length": 542.0, "completions/mean_length": 222.75, "completions/mean_terminated_length": 229.41934204101562, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8415559977293015, "epoch": 0.00505, "frac_reward_zero_std": 0.0, "grad_norm": 0.10294467955827713, "kl": 1.3116106949746609, "learning_rate": 9.999974555012965e-06, "loss": -0.017, "num_tokens": 11490355.0, "reward": 1.324973464012146, "reward_std": 1.941458821296692, "rewards/rollout_reward_func/mean": 1.324973464012146, "rewards/rollout_reward_func/std": 3.1527299880981445, "sampling/importance_sampling_ratio/max": 1.9522209167480469, "sampling/importance_sampling_ratio/mean": 0.7920087575912476, "sampling/importance_sampling_ratio/min": 0.00016560328367631882, "sampling/sampling_logp_difference/max": 2.9466586112976074, "sampling/sampling_logp_difference/mean": 0.16588374972343445, "step": 505, "step_time": 8.755389049998485 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009539473801851273, "entropy": 0.8636810816824436, "epoch": 0.00506, "grad_norm": 0.35995060205459595, "kl": 1.3140285145491362, "learning_rate": 9.999974446390002e-06, "loss": -0.0172, "step": 506, "step_time": 4.662026477002655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 562.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 191.65625, "completions/mean_terminated_length": 191.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0057082399725914, "epoch": 0.00507, "frac_reward_zero_std": 0.25, "grad_norm": 0.5320253372192383, "kl": 0.9346179347485304, "learning_rate": 9.999974337535678e-06, "loss": -0.0301, "num_tokens": 11532238.0, "reward": 0.5337050557136536, "reward_std": 2.1739702224731445, "rewards/rollout_reward_func/mean": 0.5337050557136536, "rewards/rollout_reward_func/std": 3.3935389518737793, "sampling/importance_sampling_ratio/max": 0.9719185829162598, "sampling/importance_sampling_ratio/mean": 0.7541460990905762, "sampling/importance_sampling_ratio/min": 0.012775443494319916, "sampling/sampling_logp_difference/max": 2.5272538661956787, "sampling/sampling_logp_difference/mean": 0.10798398405313492, "step": 507, "step_time": 7.873289929999373 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.030381944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.037326388992369175, "entropy": 1.110816266387701, "epoch": 0.00508, "grad_norm": 0.4659232497215271, "kl": 0.959235723130405, "learning_rate": 9.99997422845e-06, "loss": -0.0319, "step": 508, "step_time": 4.338715113999569 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1084489226341248, "epoch": 0.00509, "frac_reward_zero_std": 0.25, "grad_norm": 0.2320268154144287, "kl": 0.7292503155767918, "learning_rate": 9.999974119132963e-06, "loss": -0.0324, "num_tokens": 11572902.0, "reward": 1.1012446880340576, "reward_std": 1.806955099105835, "rewards/rollout_reward_func/mean": 1.1012446880340576, "rewards/rollout_reward_func/std": 3.1238412857055664, "sampling/importance_sampling_ratio/max": 0.9724841117858887, "sampling/importance_sampling_ratio/mean": 0.7281638383865356, "sampling/importance_sampling_ratio/min": 0.04792139679193497, "sampling/sampling_logp_difference/max": 2.5358526706695557, "sampling/sampling_logp_difference/mean": 0.16348125040531158, "step": 509, "step_time": 8.852645117001885 }, { "clip_ratio/high_max": 0.031746032647788525, "clip_ratio/high_mean": 0.015873016323894262, "clip_ratio/low_mean": 0.018229166977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03410218330100179, "entropy": 1.1983898282051086, "epoch": 0.0051, "grad_norm": 0.24264882504940033, "kl": 0.7997870342805982, "learning_rate": 9.99997400958457e-06, "loss": -0.0327, "step": 510, "step_time": 5.008049281004787 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 663.0, "completions/max_terminated_length": 663.0, "completions/mean_length": 240.59375, "completions/mean_terminated_length": 240.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2371927499771118, "epoch": 0.00511, "frac_reward_zero_std": 0.5, "grad_norm": 0.26677265763282776, "kl": 1.6094885915517807, "learning_rate": 9.999973899804816e-06, "loss": -0.0333, "num_tokens": 11615846.0, "reward": 2.61930775642395, "reward_std": 1.4485139846801758, "rewards/rollout_reward_func/mean": 2.61930775642395, "rewards/rollout_reward_func/std": 2.502798318862915, "sampling/importance_sampling_ratio/max": 0.9710572361946106, "sampling/importance_sampling_ratio/mean": 0.6661592125892639, "sampling/importance_sampling_ratio/min": 4.475912146309263e-16, "sampling/sampling_logp_difference/max": 13.022862434387207, "sampling/sampling_logp_difference/mean": 0.39429134130477905, "step": 511, "step_time": 8.316132154002844 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.030803571920841932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03861607192084193, "entropy": 1.277265727519989, "epoch": 0.00512, "grad_norm": 0.21964606642723083, "kl": 1.7159928604960442, "learning_rate": 9.999973789793707e-06, "loss": -0.0336, "step": 512, "step_time": 4.584410135001235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02361111156642437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02361111156642437, "completions/clipped_ratio": 0.0, "completions/max_length": 602.0, "completions/max_terminated_length": 602.0, "completions/mean_length": 223.71875, "completions/mean_terminated_length": 223.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8105544857680798, "epoch": 0.00513, "frac_reward_zero_std": 0.25, "grad_norm": 0.19294385612010956, "kl": 1.136420350521803, "learning_rate": 9.999973679551239e-06, "loss": -0.0429, "num_tokens": 11658371.0, "reward": 2.9190139770507812, "reward_std": 1.4837439060211182, "rewards/rollout_reward_func/mean": 2.9190139770507812, "rewards/rollout_reward_func/std": 3.0677003860473633, "sampling/importance_sampling_ratio/max": 0.9732106328010559, "sampling/importance_sampling_ratio/mean": 0.790021538734436, "sampling/importance_sampling_ratio/min": 0.1086108461022377, "sampling/sampling_logp_difference/max": 2.9620466232299805, "sampling/sampling_logp_difference/mean": 0.12401381134986877, "step": 513, "step_time": 8.826701398000296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.81892991065979, "epoch": 0.00514, "grad_norm": 0.19037900865077972, "kl": 1.1334829926490784, "learning_rate": 9.999973569077414e-06, "loss": -0.0428, "step": 514, "step_time": 4.977072407999003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007604166632518172, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007604166632518172, "completions/clipped_ratio": 0.0, "completions/max_length": 1350.0, "completions/max_terminated_length": 1350.0, "completions/mean_length": 582.0, "completions/mean_terminated_length": 582.0, "completions/min_length": 19.0, "completions/min_terminated_length": 19.0, "entropy": 1.8718441501259804, "epoch": 0.00515, "frac_reward_zero_std": 0.0, "grad_norm": 0.2581190764904022, "kl": 1.5146983116865158, "learning_rate": 9.99997345837223e-06, "loss": -0.0563, "num_tokens": 11716354.0, "reward": 2.1307406425476074, "reward_std": 2.660648822784424, "rewards/rollout_reward_func/mean": 2.1307406425476074, "rewards/rollout_reward_func/std": 3.1088008880615234, "sampling/importance_sampling_ratio/max": 0.938105583190918, "sampling/importance_sampling_ratio/mean": 0.48506397008895874, "sampling/importance_sampling_ratio/min": 7.536120251337424e-21, "sampling/sampling_logp_difference/max": 18.018766403198242, "sampling/sampling_logp_difference/mean": 0.5937107801437378, "step": 515, "step_time": 11.322576446002131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.019286859314888716, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.019286859314888716, "entropy": 1.837228186428547, "epoch": 0.00516, "grad_norm": 0.23852279782295227, "kl": 1.4774679839611053, "learning_rate": 9.99997334743569e-06, "loss": -0.0567, "step": 516, "step_time": 6.171723542996915 }, { "clip_ratio/high_max": 0.03525641094893217, "clip_ratio/high_mean": 0.017628205474466085, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023310023825615644, "completions/clipped_ratio": 0.03125, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 493.34375, "completions/mean_terminated_length": 491.3548278808594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6181087344884872, "epoch": 0.00517, "frac_reward_zero_std": 0.0, "grad_norm": 0.17936541140079498, "kl": 1.2860920690000057, "learning_rate": 9.999973236267791e-06, "loss": -0.0489, "num_tokens": 11771361.0, "reward": 3.183598279953003, "reward_std": 2.253291130065918, "rewards/rollout_reward_func/mean": 3.183598279953003, "rewards/rollout_reward_func/std": 2.895526647567749, "sampling/importance_sampling_ratio/max": 0.935730516910553, "sampling/importance_sampling_ratio/mean": 0.5067151784896851, "sampling/importance_sampling_ratio/min": 1.4409271203714478e-17, "sampling/sampling_logp_difference/max": 19.735750198364258, "sampling/sampling_logp_difference/mean": 0.4045090675354004, "step": 517, "step_time": 10.843932957001016 }, { "clip_ratio/high_max": 0.06426282227039337, "clip_ratio/high_mean": 0.032131411135196686, "clip_ratio/low_mean": 0.018302010837942362, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05043342197313905, "entropy": 1.5549091845750809, "epoch": 0.00518, "grad_norm": 0.16844524443149567, "kl": 1.2769131511449814, "learning_rate": 9.999973124868533e-06, "loss": -0.0493, "step": 518, "step_time": 6.547028258000864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010156250093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010156250093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 368.875, "completions/mean_terminated_length": 368.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2260284423828125, "epoch": 0.00519, "frac_reward_zero_std": 0.0, "grad_norm": 0.18501244485378265, "kl": 1.369009006768465, "learning_rate": 9.99997301323792e-06, "loss": -0.0637, "num_tokens": 11821513.0, "reward": 1.6326794624328613, "reward_std": 2.0790657997131348, "rewards/rollout_reward_func/mean": 1.6326794624328613, "rewards/rollout_reward_func/std": 3.573641777038574, "sampling/importance_sampling_ratio/max": 0.9637501835823059, "sampling/importance_sampling_ratio/mean": 0.644456684589386, "sampling/importance_sampling_ratio/min": 4.876291173828333e-15, "sampling/sampling_logp_difference/max": 21.07297706604004, "sampling/sampling_logp_difference/mean": 0.32510846853256226, "step": 519, "step_time": 10.671556067001802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.009114583488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009114583488553762, "entropy": 1.213031329214573, "epoch": 0.0052, "grad_norm": 0.19246578216552734, "kl": 1.3123983815312386, "learning_rate": 9.999972901375948e-06, "loss": -0.0638, "step": 520, "step_time": 5.374832954003068 }, { "clip_ratio/high_max": 0.0033783784601837397, "clip_ratio/high_mean": 0.0016891892300918698, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0046653797617182136, "completions/clipped_ratio": 0.09375, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 334.34375, "completions/mean_terminated_length": 338.7930908203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5620160177350044, "epoch": 0.00521, "frac_reward_zero_std": 0.25, "grad_norm": 0.32498881220817566, "kl": 1.157297294586897, "learning_rate": 9.99997278928262e-06, "loss": -0.0327, "num_tokens": 11868909.0, "reward": 0.4308894872665405, "reward_std": 2.1747424602508545, "rewards/rollout_reward_func/mean": 0.4308894872665405, "rewards/rollout_reward_func/std": 3.163965940475464, "sampling/importance_sampling_ratio/max": 1.0488111972808838, "sampling/importance_sampling_ratio/mean": 0.48004722595214844, "sampling/importance_sampling_ratio/min": 0.002125906990841031, "sampling/sampling_logp_difference/max": 3.2091639041900635, "sampling/sampling_logp_difference/mean": 0.2647768557071686, "step": 521, "step_time": 10.678526391999185 }, { "clip_ratio/high_max": 0.017267267452552915, "clip_ratio/high_mean": 0.008633633726276457, "clip_ratio/low_mean": 0.013392857508733869, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022026491235010326, "entropy": 1.5570540577173233, "epoch": 0.00522, "grad_norm": 0.22685834765434265, "kl": 1.121982192620635, "learning_rate": 9.999972676957931e-06, "loss": -0.0339, "step": 522, "step_time": 6.050108885001464 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 213.375, "completions/mean_terminated_length": 213.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7186517287045717, "epoch": 0.00523, "frac_reward_zero_std": 0.5, "grad_norm": 0.09361090511083603, "kl": 0.9294833242893219, "learning_rate": 9.999972564401887e-06, "loss": -0.0344, "num_tokens": 11910393.0, "reward": 3.8601200580596924, "reward_std": 1.236332893371582, "rewards/rollout_reward_func/mean": 3.8601200580596924, "rewards/rollout_reward_func/std": 1.6743046045303345, "sampling/importance_sampling_ratio/max": 1.0347226858139038, "sampling/importance_sampling_ratio/mean": 0.8367918729782104, "sampling/importance_sampling_ratio/min": 0.01722603850066662, "sampling/sampling_logp_difference/max": 2.3407175540924072, "sampling/sampling_logp_difference/mean": 0.09859670698642731, "step": 523, "step_time": 9.887516703000074 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 0.69873989187181, "epoch": 0.00524, "grad_norm": 0.10568681359291077, "kl": 0.9215970151126385, "learning_rate": 9.999972451614485e-06, "loss": -0.0344, "step": 524, "step_time": 5.083156035996581 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.020906177815049887, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028718677815049887, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 435.875, "completions/mean_terminated_length": 435.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.419496238231659, "epoch": 0.00525, "frac_reward_zero_std": 0.0, "grad_norm": 0.43807005882263184, "kl": 1.1147456616163254, "learning_rate": 9.999972338595724e-06, "loss": -0.0133, "num_tokens": 11962680.0, "reward": 1.3836101293563843, "reward_std": 3.364255428314209, "rewards/rollout_reward_func/mean": 1.3836101293563843, "rewards/rollout_reward_func/std": 3.7661073207855225, "sampling/importance_sampling_ratio/max": 1.0932384729385376, "sampling/importance_sampling_ratio/mean": 0.6128150224685669, "sampling/importance_sampling_ratio/min": 0.04323237016797066, "sampling/sampling_logp_difference/max": 1.5763890743255615, "sampling/sampling_logp_difference/mean": 0.17804570496082306, "step": 525, "step_time": 11.00250541499554 }, { "clip_ratio/high_max": 0.046875000931322575, "clip_ratio/high_mean": 0.027343750465661287, "clip_ratio/low_mean": 0.024378399830311537, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.051722150295972824, "entropy": 1.4112077504396439, "epoch": 0.00526, "grad_norm": 0.3034813404083252, "kl": 1.0243652164936066, "learning_rate": 9.999972225345608e-06, "loss": -0.0151, "step": 526, "step_time": 6.6796042830010265 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017045455053448677, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 398.0, "completions/mean_terminated_length": 398.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.955936323851347, "epoch": 0.00527, "frac_reward_zero_std": 0.25, "grad_norm": 0.22711080312728882, "kl": 1.2492630332708359, "learning_rate": 9.999972111864134e-06, "loss": -0.0442, "num_tokens": 12012302.0, "reward": 3.4857630729675293, "reward_std": 1.692855715751648, "rewards/rollout_reward_func/mean": 3.4857630729675293, "rewards/rollout_reward_func/std": 2.5527260303497314, "sampling/importance_sampling_ratio/max": 0.9698985815048218, "sampling/importance_sampling_ratio/mean": 0.7303816676139832, "sampling/importance_sampling_ratio/min": 2.4012179679687537e-16, "sampling/sampling_logp_difference/max": 15.283987045288086, "sampling/sampling_logp_difference/mean": 0.485219269990921, "step": 527, "step_time": 10.8076550220012 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01657197019085288, "entropy": 0.934865839779377, "epoch": 0.00528, "grad_norm": 0.2240070253610611, "kl": 1.2365938276052475, "learning_rate": 9.9999719981513e-06, "loss": -0.0439, "step": 528, "step_time": 5.659446088999175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 193.40625, "completions/mean_terminated_length": 193.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7229057513177395, "epoch": 0.00529, "frac_reward_zero_std": 0.25, "grad_norm": 0.2749546468257904, "kl": 0.5495444275438786, "learning_rate": 9.99997188420711e-06, "loss": -0.0265, "num_tokens": 12052770.0, "reward": 0.07283735275268555, "reward_std": 2.19883131980896, "rewards/rollout_reward_func/mean": 0.07283735275268555, "rewards/rollout_reward_func/std": 3.4002678394317627, "sampling/importance_sampling_ratio/max": 0.97177654504776, "sampling/importance_sampling_ratio/mean": 0.8125364184379578, "sampling/importance_sampling_ratio/min": 0.007580873556435108, "sampling/sampling_logp_difference/max": 2.462484359741211, "sampling/sampling_logp_difference/mean": 0.1155889630317688, "step": 529, "step_time": 10.329194206000466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.735798541456461, "epoch": 0.0053, "grad_norm": 0.2766735255718231, "kl": 0.5347135858610272, "learning_rate": 9.999971770031562e-06, "loss": -0.0268, "step": 530, "step_time": 5.7893383800037554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 229.15625, "completions/mean_terminated_length": 229.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8352021351456642, "epoch": 0.00531, "frac_reward_zero_std": 0.25, "grad_norm": 0.24856433272361755, "kl": 1.0127775287255645, "learning_rate": 9.999971655624656e-06, "loss": -0.033, "num_tokens": 12097078.0, "reward": 2.4010157585144043, "reward_std": 2.634079694747925, "rewards/rollout_reward_func/mean": 2.4010157585144043, "rewards/rollout_reward_func/std": 3.223740816116333, "sampling/importance_sampling_ratio/max": 0.9715428352355957, "sampling/importance_sampling_ratio/mean": 0.7995184659957886, "sampling/importance_sampling_ratio/min": 0.0003441196458879858, "sampling/sampling_logp_difference/max": 3.3831887245178223, "sampling/sampling_logp_difference/mean": 0.12485520541667938, "step": 531, "step_time": 9.895630248001908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.8458217084407806, "epoch": 0.00532, "grad_norm": 0.2463138997554779, "kl": 1.0131473243236542, "learning_rate": 9.999971540986392e-06, "loss": -0.0333, "step": 532, "step_time": 4.70654397600083 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.03125, "completions/max_length": 16.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.40625, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6229059137403965, "epoch": 0.00533, "frac_reward_zero_std": 0.75, "grad_norm": 0.0867200493812561, "kl": 0.301856747828424, "learning_rate": 9.999971426116773e-06, "loss": 0.013, "num_tokens": 12127266.0, "reward": 2.3190371990203857, "reward_std": 0.17416195571422577, "rewards/rollout_reward_func/mean": 2.3190371990203857, "rewards/rollout_reward_func/std": 2.9765655994415283, "sampling/importance_sampling_ratio/max": 0.9953253269195557, "sampling/importance_sampling_ratio/mean": 0.9119924902915955, "sampling/importance_sampling_ratio/min": 9.07139863226547e-10, "sampling/sampling_logp_difference/max": 2.275739908218384, "sampling/sampling_logp_difference/mean": 0.18988946080207825, "step": 533, "step_time": 6.3559545280058956 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 0.6098764445632696, "epoch": 0.00534, "grad_norm": 0.07615851610898972, "kl": 0.29994551092386246, "learning_rate": 9.999971311015795e-06, "loss": 0.0132, "step": 534, "step_time": 3.3522810229987954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 506.59375, "completions/mean_terminated_length": 506.59375, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "entropy": 0.9438053034245968, "epoch": 0.00535, "frac_reward_zero_std": 0.25, "grad_norm": 0.25231853127479553, "kl": 0.9772574175149202, "learning_rate": 9.999971195683457e-06, "loss": -0.0258, "num_tokens": 12183050.0, "reward": 3.9529902935028076, "reward_std": 2.064357280731201, "rewards/rollout_reward_func/mean": 3.9529902935028076, "rewards/rollout_reward_func/std": 2.3567817211151123, "sampling/importance_sampling_ratio/max": 0.9861356616020203, "sampling/importance_sampling_ratio/mean": 0.7216306924819946, "sampling/importance_sampling_ratio/min": 1.0602348896644891e-38, "sampling/sampling_logp_difference/max": 16.101383209228516, "sampling/sampling_logp_difference/mean": 0.44557735323905945, "step": 535, "step_time": 11.464753504998953 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333535119891, "entropy": 0.9648814797401428, "epoch": 0.00536, "grad_norm": 0.2597038149833679, "kl": 0.9780654516071081, "learning_rate": 9.999971080119765e-06, "loss": -0.0268, "step": 536, "step_time": 6.233525958999962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01175213698297739, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01175213698297739, "completions/clipped_ratio": 0.0, "completions/max_length": 368.0, "completions/max_terminated_length": 368.0, "completions/mean_length": 79.25, "completions/mean_terminated_length": 79.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6888117790222168, "epoch": 0.00537, "frac_reward_zero_std": 0.25, "grad_norm": 0.2771075665950775, "kl": 0.3036384480074048, "learning_rate": 9.999970964324714e-06, "loss": -0.0225, "num_tokens": 12219574.0, "reward": 3.667193651199341, "reward_std": 1.4907891750335693, "rewards/rollout_reward_func/mean": 3.667193651199341, "rewards/rollout_reward_func/std": 1.8128677606582642, "sampling/importance_sampling_ratio/max": 0.9926667213439941, "sampling/importance_sampling_ratio/mean": 0.8205742835998535, "sampling/importance_sampling_ratio/min": 0.01018544752150774, "sampling/sampling_logp_difference/max": 2.49147367477417, "sampling/sampling_logp_difference/mean": 0.10545822978019714, "step": 537, "step_time": 7.606352792001417 }, { "clip_ratio/high_max": 0.030448718927800655, "clip_ratio/high_mean": 0.015224359463900328, "clip_ratio/low_mean": 0.01175213698297739, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026976496446877718, "entropy": 0.6304069012403488, "epoch": 0.00538, "grad_norm": 0.2675606608390808, "kl": 0.29207404144108295, "learning_rate": 9.999970848298305e-06, "loss": -0.023, "step": 538, "step_time": 4.031895496000288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 441.8125, "completions/mean_terminated_length": 417.7333679199219, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2835059985518456, "epoch": 0.00539, "frac_reward_zero_std": 0.25, "grad_norm": 0.4927231967449188, "kl": 1.0617645867168903, "learning_rate": 9.999970732040537e-06, "loss": -0.0372, "num_tokens": 12271480.0, "reward": 1.2818849086761475, "reward_std": 2.1209397315979004, "rewards/rollout_reward_func/mean": 1.2818849086761475, "rewards/rollout_reward_func/std": 3.105111837387085, "sampling/importance_sampling_ratio/max": 1.0487773418426514, "sampling/importance_sampling_ratio/mean": 0.615896463394165, "sampling/importance_sampling_ratio/min": 4.664869379666925e-08, "sampling/sampling_logp_difference/max": 3.0516366958618164, "sampling/sampling_logp_difference/mean": 0.24597886204719543, "step": 539, "step_time": 11.404241085994727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01772435917519033, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01772435917519033, "entropy": 1.3082151040434837, "epoch": 0.0054, "grad_norm": 0.44666793942451477, "kl": 1.064925629645586, "learning_rate": 9.999970615551415e-06, "loss": -0.0379, "step": 540, "step_time": 6.395340023000244 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 269.25, "completions/mean_terminated_length": 269.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6478236503899097, "epoch": 0.00541, "frac_reward_zero_std": 0.5, "grad_norm": 0.20808206498622894, "kl": 0.7809935137629509, "learning_rate": 9.999970498830934e-06, "loss": -0.0008, "num_tokens": 12314800.0, "reward": 2.3642802238464355, "reward_std": 0.7152913808822632, "rewards/rollout_reward_func/mean": 2.3642802238464355, "rewards/rollout_reward_func/std": 2.6497058868408203, "sampling/importance_sampling_ratio/max": 0.972186803817749, "sampling/importance_sampling_ratio/mean": 0.818439245223999, "sampling/importance_sampling_ratio/min": 0.048901595175266266, "sampling/sampling_logp_difference/max": 2.274906873703003, "sampling/sampling_logp_difference/mean": 0.08302247524261475, "step": 541, "step_time": 10.359902608000993 }, { "clip_ratio/high_max": 0.02211538515985012, "clip_ratio/high_mean": 0.01105769257992506, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01105769257992506, "entropy": 0.6582599058747292, "epoch": 0.00542, "grad_norm": 0.24342259764671326, "kl": 0.794876916334033, "learning_rate": 9.999970381879096e-06, "loss": -0.0011, "step": 542, "step_time": 5.736762195001575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 275.03125, "completions/mean_terminated_length": 275.03125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8344537913799286, "epoch": 0.00543, "frac_reward_zero_std": 0.25, "grad_norm": 0.22780881822109222, "kl": 1.231281753629446, "learning_rate": 9.999970264695899e-06, "loss": -0.0474, "num_tokens": 12359748.0, "reward": 2.70570969581604, "reward_std": 2.3748579025268555, "rewards/rollout_reward_func/mean": 2.70570969581604, "rewards/rollout_reward_func/std": 2.891200065612793, "sampling/importance_sampling_ratio/max": 0.9729215502738953, "sampling/importance_sampling_ratio/mean": 0.7478217482566833, "sampling/importance_sampling_ratio/min": 1.311659503231948e-16, "sampling/sampling_logp_difference/max": 13.467451095581055, "sampling/sampling_logp_difference/mean": 0.2920058071613312, "step": 543, "step_time": 9.926575198996943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.834478035569191, "epoch": 0.00544, "grad_norm": 0.19928063452243805, "kl": 1.211976807564497, "learning_rate": 9.999970147281346e-06, "loss": -0.0474, "step": 544, "step_time": 5.498486933000095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 512.0, "completions/max_terminated_length": 512.0, "completions/mean_length": 193.6875, "completions/mean_terminated_length": 193.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.35144414752721786, "epoch": 0.00545, "frac_reward_zero_std": 0.5, "grad_norm": 0.2317347377538681, "kl": 1.3858313239179552, "learning_rate": 9.999970029635434e-06, "loss": -0.013, "num_tokens": 12401312.0, "reward": 2.251572608947754, "reward_std": 0.7516929507255554, "rewards/rollout_reward_func/mean": 2.251572608947754, "rewards/rollout_reward_func/std": 2.9818456172943115, "sampling/importance_sampling_ratio/max": 0.9710263013839722, "sampling/importance_sampling_ratio/mean": 0.8861041069030762, "sampling/importance_sampling_ratio/min": 0.03854726627469063, "sampling/sampling_logp_difference/max": 3.315427541732788, "sampling/sampling_logp_difference/mean": 0.0625067949295044, "step": 545, "step_time": 7.691441536999264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.34742890670895576, "epoch": 0.00546, "grad_norm": 0.19692310690879822, "kl": 1.3209115751087666, "learning_rate": 9.999969911758165e-06, "loss": -0.0132, "step": 546, "step_time": 4.240186446997541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 311.53125, "completions/mean_terminated_length": 311.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5183061622083187, "epoch": 0.00547, "frac_reward_zero_std": 0.0, "grad_norm": 0.3143036961555481, "kl": 1.2848607823252678, "learning_rate": 9.999969793649539e-06, "loss": -0.016, "num_tokens": 12449206.0, "reward": 3.805889844894409, "reward_std": 1.4891021251678467, "rewards/rollout_reward_func/mean": 3.805889844894409, "rewards/rollout_reward_func/std": 1.823185682296753, "sampling/importance_sampling_ratio/max": 1.0158056020736694, "sampling/importance_sampling_ratio/mean": 0.8602503538131714, "sampling/importance_sampling_ratio/min": 9.287996861816818e-20, "sampling/sampling_logp_difference/max": 18.51479721069336, "sampling/sampling_logp_difference/mean": 0.2735349237918854, "step": 547, "step_time": 10.442409536999548 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.5028149709105492, "epoch": 0.00548, "grad_norm": 0.31013503670692444, "kl": 1.2855870313942432, "learning_rate": 9.999969675309556e-06, "loss": -0.0162, "step": 548, "step_time": 6.8056262800018885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1128.0, "completions/max_terminated_length": 1128.0, "completions/mean_length": 196.28125, "completions/mean_terminated_length": 196.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6495900303125381, "epoch": 0.00549, "frac_reward_zero_std": 0.5, "grad_norm": 0.18790289759635925, "kl": 1.2354748826473951, "learning_rate": 9.999969556738213e-06, "loss": -0.0025, "num_tokens": 12491745.0, "reward": 1.6119921207427979, "reward_std": 1.403378963470459, "rewards/rollout_reward_func/mean": 1.6119921207427979, "rewards/rollout_reward_func/std": 2.9790399074554443, "sampling/importance_sampling_ratio/max": 1.3981798887252808, "sampling/importance_sampling_ratio/mean": 0.8641494512557983, "sampling/importance_sampling_ratio/min": 0.003060899907723069, "sampling/sampling_logp_difference/max": 2.847306251525879, "sampling/sampling_logp_difference/mean": 0.11404507607221603, "step": 549, "step_time": 9.875547307998204 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.64667734131217, "epoch": 0.0055, "grad_norm": 0.16870473325252533, "kl": 1.1996493767946959, "learning_rate": 9.999969437935517e-06, "loss": -0.0027, "step": 550, "step_time": 5.640766619002534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 94.96875, "completions/mean_terminated_length": 94.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.3771507889032364, "epoch": 0.00551, "frac_reward_zero_std": 0.75, "grad_norm": 0.02412821725010872, "kl": 0.8743927022442222, "learning_rate": 9.99996931890146e-06, "loss": -0.0188, "num_tokens": 12528378.0, "reward": 2.282623052597046, "reward_std": 0.11187603324651718, "rewards/rollout_reward_func/mean": 2.282623052597046, "rewards/rollout_reward_func/std": 3.1064114570617676, "sampling/importance_sampling_ratio/max": 1.1392117738723755, "sampling/importance_sampling_ratio/mean": 0.9277955293655396, "sampling/importance_sampling_ratio/min": 0.02686932310461998, "sampling/sampling_logp_difference/max": 1.6674044132232666, "sampling/sampling_logp_difference/mean": 0.04762021452188492, "step": 551, "step_time": 8.61596310999812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.3849210664629936, "epoch": 0.00552, "grad_norm": 0.03131844103336334, "kl": 0.8787051499821246, "learning_rate": 9.999969199636046e-06, "loss": -0.0188, "step": 552, "step_time": 5.156757270999151 }, { "clip_ratio/high_max": 0.004310344811528921, "clip_ratio/high_mean": 0.0021551724057644606, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021551724057644606, "completions/clipped_ratio": 0.03125, "completions/max_length": 1233.0, "completions/max_terminated_length": 1233.0, "completions/mean_length": 481.4375, "completions/mean_terminated_length": 478.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9553285613656044, "epoch": 0.00553, "frac_reward_zero_std": 0.25, "grad_norm": 0.1521061509847641, "kl": 1.0750275254249573, "learning_rate": 9.999969080139276e-06, "loss": -0.0111, "num_tokens": 12581517.0, "reward": 1.175532579421997, "reward_std": 1.9558191299438477, "rewards/rollout_reward_func/mean": 1.175532579421997, "rewards/rollout_reward_func/std": 2.962045192718506, "sampling/importance_sampling_ratio/max": 0.9735164642333984, "sampling/importance_sampling_ratio/mean": 0.6661818623542786, "sampling/importance_sampling_ratio/min": 6.388499306319672e-16, "sampling/sampling_logp_difference/max": 16.94160270690918, "sampling/sampling_logp_difference/mean": 0.28500428795814514, "step": 553, "step_time": 11.419495779002318 }, { "clip_ratio/high_max": 0.004310344811528921, "clip_ratio/high_mean": 0.0021551724057644606, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006619458319619298, "entropy": 0.9598577134311199, "epoch": 0.00554, "grad_norm": 0.17175991833209991, "kl": 1.070671420544386, "learning_rate": 9.999968960411146e-06, "loss": -0.0109, "step": 554, "step_time": 5.967319656003383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0625, "completions/max_length": 1259.0, "completions/max_terminated_length": 1259.0, "completions/mean_length": 298.25, "completions/mean_terminated_length": 289.8333435058594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8600516133010387, "epoch": 0.00555, "frac_reward_zero_std": 0.0, "grad_norm": 0.16168786585330963, "kl": 0.6912454105913639, "learning_rate": 9.999968840451662e-06, "loss": -0.034, "num_tokens": 12629222.0, "reward": 2.4660122394561768, "reward_std": 1.8788249492645264, "rewards/rollout_reward_func/mean": 2.4660122394561768, "rewards/rollout_reward_func/std": 3.2545902729034424, "sampling/importance_sampling_ratio/max": 0.9797153472900391, "sampling/importance_sampling_ratio/mean": 0.7465136051177979, "sampling/importance_sampling_ratio/min": 0.0003600725613068789, "sampling/sampling_logp_difference/max": 2.5274734497070312, "sampling/sampling_logp_difference/mean": 0.12437939643859863, "step": 555, "step_time": 10.49405151800056 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8334559015929699, "epoch": 0.00556, "grad_norm": 0.139252707362175, "kl": 0.6722989156842232, "learning_rate": 9.999968720260818e-06, "loss": -0.0343, "step": 556, "step_time": 6.461286615003701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1285.0, "completions/max_terminated_length": 1285.0, "completions/mean_length": 376.75, "completions/mean_terminated_length": 376.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0095805935561657, "epoch": 0.00557, "frac_reward_zero_std": 0.0, "grad_norm": 0.28813353180885315, "kl": 0.9586856737732887, "learning_rate": 9.999968599838618e-06, "loss": -0.0291, "num_tokens": 12678956.0, "reward": 0.8839707970619202, "reward_std": 3.1894664764404297, "rewards/rollout_reward_func/mean": 0.8839707970619202, "rewards/rollout_reward_func/std": 3.7785799503326416, "sampling/importance_sampling_ratio/max": 0.9651698470115662, "sampling/importance_sampling_ratio/mean": 0.6832971572875977, "sampling/importance_sampling_ratio/min": 0.01402841042727232, "sampling/sampling_logp_difference/max": 2.3923611640930176, "sampling/sampling_logp_difference/mean": 0.1506999433040619, "step": 557, "step_time": 11.011922970999876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.984325036406517, "epoch": 0.00558, "grad_norm": 0.24874983727931976, "kl": 0.9588083885610104, "learning_rate": 9.99996847918506e-06, "loss": -0.0298, "step": 558, "step_time": 5.957736944001226 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 322.3125, "completions/mean_terminated_length": 322.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6299863830208778, "epoch": 0.00559, "frac_reward_zero_std": 0.25, "grad_norm": 0.30285462737083435, "kl": 0.9718420282006264, "learning_rate": 9.999968358300144e-06, "loss": -0.0467, "num_tokens": 12726307.0, "reward": 4.26622200012207, "reward_std": 0.8534752130508423, "rewards/rollout_reward_func/mean": 4.26622200012207, "rewards/rollout_reward_func/std": 1.221349835395813, "sampling/importance_sampling_ratio/max": 0.9765149354934692, "sampling/importance_sampling_ratio/mean": 0.816184401512146, "sampling/importance_sampling_ratio/min": 6.680267216539705e-16, "sampling/sampling_logp_difference/max": 15.1216402053833, "sampling/sampling_logp_difference/mean": 0.25717347860336304, "step": 559, "step_time": 10.451059310998971 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01406249962747097, "entropy": 0.6205399930477142, "epoch": 0.0056, "grad_norm": 0.050988372415304184, "kl": 0.9500163123011589, "learning_rate": 9.999968237183872e-06, "loss": -0.047, "step": 560, "step_time": 6.244169907000469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 277.9375, "completions/mean_terminated_length": 277.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8073817417025566, "epoch": 0.00561, "frac_reward_zero_std": 0.0, "grad_norm": 0.31852659583091736, "kl": 1.2279831301420927, "learning_rate": 9.99996811583624e-06, "loss": -0.0043, "num_tokens": 12773278.0, "reward": 0.8817975521087646, "reward_std": 1.760772943496704, "rewards/rollout_reward_func/mean": 0.8817975521087646, "rewards/rollout_reward_func/std": 3.4354734420776367, "sampling/importance_sampling_ratio/max": 2.3293871879577637, "sampling/importance_sampling_ratio/mean": 0.8160617351531982, "sampling/importance_sampling_ratio/min": 0.049917202442884445, "sampling/sampling_logp_difference/max": 2.6468920707702637, "sampling/sampling_logp_difference/mean": 0.11957312375307083, "step": 561, "step_time": 9.985138273999837 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "entropy": 0.7940157726407051, "epoch": 0.00562, "grad_norm": 0.1936478465795517, "kl": 1.2045081965625286, "learning_rate": 9.999967994257254e-06, "loss": -0.005, "step": 562, "step_time": 5.32173193699964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.022034244146198034, "clip_ratio/low_min": 0.0078125, "clip_ratio/region_mean": 0.022034244146198034, "completions/clipped_ratio": 0.125, "completions/max_length": 1157.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 470.78125, "completions/mean_terminated_length": 457.107177734375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7087375447154045, "epoch": 0.00563, "frac_reward_zero_std": 0.0, "grad_norm": 0.3263857960700989, "kl": 1.0753091238439083, "learning_rate": 9.999967872446908e-06, "loss": -0.0419, "num_tokens": 12827276.0, "reward": 1.3511426448822021, "reward_std": 2.9961955547332764, "rewards/rollout_reward_func/mean": 1.3511426448822021, "rewards/rollout_reward_func/std": 3.393129348754883, "sampling/importance_sampling_ratio/max": 0.9635075330734253, "sampling/importance_sampling_ratio/mean": 0.53351891040802, "sampling/importance_sampling_ratio/min": 4.2429046167280556e-29, "sampling/sampling_logp_difference/max": 18.85893440246582, "sampling/sampling_logp_difference/mean": 0.6766440868377686, "step": 563, "step_time": 10.727449443998921 }, { "clip_ratio/high_max": 0.02211538515985012, "clip_ratio/high_mean": 0.01105769257992506, "clip_ratio/low_mean": 0.029384460300207138, "clip_ratio/low_min": 0.006756756920367479, "clip_ratio/region_mean": 0.04044215311296284, "entropy": 1.731799378991127, "epoch": 0.00564, "grad_norm": 0.257315069437027, "kl": 1.0807871539145708, "learning_rate": 9.999967750405206e-06, "loss": -0.0433, "step": 564, "step_time": 5.732265425000151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 336.59375, "completions/mean_terminated_length": 328.5483703613281, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0954448767006397, "epoch": 0.00565, "frac_reward_zero_std": 0.25, "grad_norm": 0.17554858326911926, "kl": 0.4829954793676734, "learning_rate": 9.999967628132146e-06, "loss": -0.0217, "num_tokens": 12872996.0, "reward": 2.3667819499969482, "reward_std": 2.0026955604553223, "rewards/rollout_reward_func/mean": 2.3667819499969482, "rewards/rollout_reward_func/std": 2.8466920852661133, "sampling/importance_sampling_ratio/max": 0.9737358093261719, "sampling/importance_sampling_ratio/mean": 0.6535273790359497, "sampling/importance_sampling_ratio/min": 0.04292701557278633, "sampling/sampling_logp_difference/max": 2.0137035846710205, "sampling/sampling_logp_difference/mean": 0.15434637665748596, "step": 565, "step_time": 11.129787436000697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018750000279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 1.1247958466410637, "epoch": 0.00566, "grad_norm": 0.18111670017242432, "kl": 0.4868197739124298, "learning_rate": 9.99996750562773e-06, "loss": -0.0219, "step": 566, "step_time": 5.839115145003234 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013736264314502478, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 468.78125, "completions/mean_terminated_length": 468.78125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1985476985573769, "epoch": 0.00567, "frac_reward_zero_std": 0.0, "grad_norm": 0.28775468468666077, "kl": 1.2231285944581032, "learning_rate": 9.999967382891956e-06, "loss": -0.0287, "num_tokens": 12926919.0, "reward": 2.5682127475738525, "reward_std": 1.9472922086715698, "rewards/rollout_reward_func/mean": 2.5682127475738525, "rewards/rollout_reward_func/std": 3.402878999710083, "sampling/importance_sampling_ratio/max": 0.9668861627578735, "sampling/importance_sampling_ratio/mean": 0.6576176285743713, "sampling/importance_sampling_ratio/min": 0.008804217912256718, "sampling/sampling_logp_difference/max": 2.3818626403808594, "sampling/sampling_logp_difference/mean": 0.16764512658119202, "step": 567, "step_time": 10.261903830001756 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.1869383230805397, "epoch": 0.00568, "grad_norm": 0.27196502685546875, "kl": 1.2166822478175163, "learning_rate": 9.999967259924823e-06, "loss": -0.0293, "step": 568, "step_time": 5.77031275599802 }, { "clip_ratio/high_max": 0.03485576994717121, "clip_ratio/high_mean": 0.017427884973585606, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02223557746037841, "completions/clipped_ratio": 0.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 360.4375, "completions/mean_terminated_length": 360.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7948243096470833, "epoch": 0.00569, "frac_reward_zero_std": 0.5, "grad_norm": 0.10528336465358734, "kl": 0.820365970954299, "learning_rate": 9.999967136726334e-06, "loss": -0.0091, "num_tokens": 12975026.0, "reward": 1.9637326002120972, "reward_std": 1.4593921899795532, "rewards/rollout_reward_func/mean": 1.9637326002120972, "rewards/rollout_reward_func/std": 3.1828789710998535, "sampling/importance_sampling_ratio/max": 0.9734419584274292, "sampling/importance_sampling_ratio/mean": 0.7424527406692505, "sampling/importance_sampling_ratio/min": 0.013174143619835377, "sampling/sampling_logp_difference/max": 2.796687126159668, "sampling/sampling_logp_difference/mean": 0.11625449359416962, "step": 569, "step_time": 10.987578487001883 }, { "clip_ratio/high_max": 0.02704326994717121, "clip_ratio/high_mean": 0.013521634973585606, "clip_ratio/low_mean": 0.017307692673057318, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030829327180981636, "entropy": 0.7993888854980469, "epoch": 0.0057, "grad_norm": 0.13245892524719238, "kl": 0.8425608836114407, "learning_rate": 9.999967013296488e-06, "loss": -0.0091, "step": 570, "step_time": 6.341492899000514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 372.625, "completions/mean_terminated_length": 372.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.873575247824192, "epoch": 0.00571, "frac_reward_zero_std": 0.25, "grad_norm": 0.18955251574516296, "kl": 1.036339346319437, "learning_rate": 9.999966889635285e-06, "loss": -0.0225, "num_tokens": 13022190.0, "reward": 2.3011090755462646, "reward_std": 2.276495933532715, "rewards/rollout_reward_func/mean": 2.3011090755462646, "rewards/rollout_reward_func/std": 3.4028677940368652, "sampling/importance_sampling_ratio/max": 1.0015637874603271, "sampling/importance_sampling_ratio/mean": 0.6921075582504272, "sampling/importance_sampling_ratio/min": 0.08529319614171982, "sampling/sampling_logp_difference/max": 2.272636890411377, "sampling/sampling_logp_difference/mean": 0.1097971647977829, "step": 571, "step_time": 10.763006974000746 }, { "clip_ratio/high_max": 0.01875000074505806, "clip_ratio/high_mean": 0.00937500037252903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00937500037252903, "entropy": 0.8610194027423859, "epoch": 0.00572, "grad_norm": 0.16286644339561462, "kl": 1.0556993074715137, "learning_rate": 9.999966765742724e-06, "loss": -0.023, "step": 572, "step_time": 5.988548399996944 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010146104265004396, "completions/clipped_ratio": 0.0, "completions/max_length": 1351.0, "completions/max_terminated_length": 1351.0, "completions/mean_length": 485.59375, "completions/mean_terminated_length": 485.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4966364726424217, "epoch": 0.00573, "frac_reward_zero_std": 0.0, "grad_norm": 0.2316640019416809, "kl": 1.210843626409769, "learning_rate": 9.999966641618805e-06, "loss": -0.0511, "num_tokens": 13076858.0, "reward": 1.5680395364761353, "reward_std": 2.4333465099334717, "rewards/rollout_reward_func/mean": 1.5680395364761353, "rewards/rollout_reward_func/std": 3.52186918258667, "sampling/importance_sampling_ratio/max": 0.9445250630378723, "sampling/importance_sampling_ratio/mean": 0.542987585067749, "sampling/importance_sampling_ratio/min": 0.005983248818665743, "sampling/sampling_logp_difference/max": 3.3724565505981445, "sampling/sampling_logp_difference/mean": 0.23839926719665527, "step": 573, "step_time": 11.361225430997365 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.013034759555011988, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017499045468866825, "entropy": 1.5177309326827526, "epoch": 0.00574, "grad_norm": 0.2031317800283432, "kl": 1.1720059886574745, "learning_rate": 9.99996651726353e-06, "loss": -0.0519, "step": 574, "step_time": 6.638169648998883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 328.28125, "completions/mean_terminated_length": 328.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0762870274484158, "epoch": 0.00575, "frac_reward_zero_std": 0.25, "grad_norm": 0.21818137168884277, "kl": 1.3727467730641365, "learning_rate": 9.999966392676898e-06, "loss": -0.011, "num_tokens": 13124958.0, "reward": 1.9392286539077759, "reward_std": 1.5780640840530396, "rewards/rollout_reward_func/mean": 1.9392286539077759, "rewards/rollout_reward_func/std": 3.6038105487823486, "sampling/importance_sampling_ratio/max": 1.091375470161438, "sampling/importance_sampling_ratio/mean": 0.6763774752616882, "sampling/importance_sampling_ratio/min": 0.006669826339930296, "sampling/sampling_logp_difference/max": 1.9719045162200928, "sampling/sampling_logp_difference/mean": 0.14898774027824402, "step": 575, "step_time": 9.820724717001212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 1.1058176383376122, "epoch": 0.00576, "grad_norm": 0.21706783771514893, "kl": 1.3816342055797577, "learning_rate": 9.999966267858909e-06, "loss": -0.0109, "step": 576, "step_time": 5.4618569429967465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 231.5625, "completions/mean_terminated_length": 231.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2166098728775978, "epoch": 0.00577, "frac_reward_zero_std": 0.5, "grad_norm": 0.4497041702270508, "kl": 0.8068901784718037, "learning_rate": 9.999966142809561e-06, "loss": -0.014, "num_tokens": 13168008.0, "reward": 1.8914612531661987, "reward_std": 1.1301112174987793, "rewards/rollout_reward_func/mean": 1.8914612531661987, "rewards/rollout_reward_func/std": 3.323964834213257, "sampling/importance_sampling_ratio/max": 1.3570507764816284, "sampling/importance_sampling_ratio/mean": 0.6871007680892944, "sampling/importance_sampling_ratio/min": 0.02672838792204857, "sampling/sampling_logp_difference/max": 2.949693202972412, "sampling/sampling_logp_difference/mean": 0.16215063631534576, "step": 577, "step_time": 10.16327459400236 }, { "clip_ratio/high_max": 0.01666666753590107, "clip_ratio/high_mean": 0.008333333767950535, "clip_ratio/low_mean": 0.02234848542138934, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030681819189339876, "entropy": 1.257919229567051, "epoch": 0.00578, "grad_norm": 0.3247930705547333, "kl": 0.8016318641602993, "learning_rate": 9.999966017528855e-06, "loss": -0.0153, "step": 578, "step_time": 5.239922991997446 }, { "clip_ratio/high_max": 0.03490259870886803, "clip_ratio/high_mean": 0.017451299354434013, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02370129944756627, "completions/clipped_ratio": 0.0625, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 239.125, "completions/mean_terminated_length": 224.1333465576172, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5998980551958084, "epoch": 0.00579, "frac_reward_zero_std": 0.5, "grad_norm": 0.14758744835853577, "kl": 0.5547509081661701, "learning_rate": 9.999965892016794e-06, "loss": -0.0114, "num_tokens": 13210831.0, "reward": 1.9742183685302734, "reward_std": 1.7895667552947998, "rewards/rollout_reward_func/mean": 1.9742183685302734, "rewards/rollout_reward_func/std": 3.2008402347564697, "sampling/importance_sampling_ratio/max": 0.9731432795524597, "sampling/importance_sampling_ratio/mean": 0.5654269456863403, "sampling/importance_sampling_ratio/min": 0.00012347502342890948, "sampling/sampling_logp_difference/max": 2.6402876377105713, "sampling/sampling_logp_difference/mean": 0.2969970703125, "step": 579, "step_time": 10.377765658002318 }, { "clip_ratio/high_max": 0.04077381081879139, "clip_ratio/high_mean": 0.023227814119309187, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023227814119309187, "entropy": 1.5784113109111786, "epoch": 0.0058, "grad_norm": 0.17188747227191925, "kl": 0.5702024418860674, "learning_rate": 9.999965766273377e-06, "loss": -0.0115, "step": 580, "step_time": 5.558220601000357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 251.0625, "completions/mean_terminated_length": 251.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9776499271392822, "epoch": 0.00581, "frac_reward_zero_std": 0.5, "grad_norm": 0.17754018306732178, "kl": 0.8899655751883984, "learning_rate": 9.999965640298598e-06, "loss": -0.0297, "num_tokens": 13256064.0, "reward": 2.608126163482666, "reward_std": 1.1168774366378784, "rewards/rollout_reward_func/mean": 2.608126163482666, "rewards/rollout_reward_func/std": 3.1544578075408936, "sampling/importance_sampling_ratio/max": 0.9710748791694641, "sampling/importance_sampling_ratio/mean": 0.737217903137207, "sampling/importance_sampling_ratio/min": 0.0741211324930191, "sampling/sampling_logp_difference/max": 2.1700329780578613, "sampling/sampling_logp_difference/mean": 0.11015582084655762, "step": 581, "step_time": 9.941104672996516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.966766469180584, "epoch": 0.00582, "grad_norm": 0.1880107969045639, "kl": 0.8740267753601074, "learning_rate": 9.999965514092466e-06, "loss": -0.0299, "step": 582, "step_time": 5.5907416050031316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 246.28125, "completions/mean_terminated_length": 246.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.887239009141922, "epoch": 0.00583, "frac_reward_zero_std": 0.0, "grad_norm": 0.3278046250343323, "kl": 0.9741146098822355, "learning_rate": 9.999965387654976e-06, "loss": -0.0209, "num_tokens": 13301081.0, "reward": 1.1792426109313965, "reward_std": 2.5587568283081055, "rewards/rollout_reward_func/mean": 1.1792426109313965, "rewards/rollout_reward_func/std": 3.6852760314941406, "sampling/importance_sampling_ratio/max": 0.9642409682273865, "sampling/importance_sampling_ratio/mean": 0.7255679965019226, "sampling/importance_sampling_ratio/min": 0.13785836100578308, "sampling/sampling_logp_difference/max": 1.8071315288543701, "sampling/sampling_logp_difference/mean": 0.11473817378282547, "step": 583, "step_time": 10.00929495499986 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.8666587322950363, "epoch": 0.00584, "grad_norm": 0.5423286557197571, "kl": 0.9731207191944122, "learning_rate": 9.999965260986127e-06, "loss": -0.0208, "step": 584, "step_time": 5.50647013300113 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.015224359463900328, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02043269295245409, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 422.375, "completions/mean_terminated_length": 422.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4304271042346954, "epoch": 0.00585, "frac_reward_zero_std": 0.0, "grad_norm": 0.3240177035331726, "kl": 1.142060860991478, "learning_rate": 9.999965134085922e-06, "loss": -0.0662, "num_tokens": 13353690.0, "reward": 1.4809253215789795, "reward_std": 3.5300211906433105, "rewards/rollout_reward_func/mean": 1.4809253215789795, "rewards/rollout_reward_func/std": 3.810992956161499, "sampling/importance_sampling_ratio/max": 1.2285709381103516, "sampling/importance_sampling_ratio/mean": 0.5619844794273376, "sampling/importance_sampling_ratio/min": 0.0029045664705336094, "sampling/sampling_logp_difference/max": 2.846268653869629, "sampling/sampling_logp_difference/mean": 0.20450648665428162, "step": 585, "step_time": 10.63744953999958 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.3919290862977505, "epoch": 0.00586, "grad_norm": 0.2462041974067688, "kl": 1.1001071855425835, "learning_rate": 9.999965006954359e-06, "loss": -0.067, "step": 586, "step_time": 6.3983265579972795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 518.71875, "completions/mean_terminated_length": 523.4838256835938, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "entropy": 0.8989866673946381, "epoch": 0.00587, "frac_reward_zero_std": 0.0, "grad_norm": 2.006312608718872, "kl": 1.3050077185034752, "learning_rate": 9.999964879591441e-06, "loss": 0.0042, "num_tokens": 13409373.0, "reward": 2.5675315856933594, "reward_std": 1.6634793281555176, "rewards/rollout_reward_func/mean": 2.5675315856933594, "rewards/rollout_reward_func/std": 3.091336727142334, "sampling/importance_sampling_ratio/max": 1.0557633638381958, "sampling/importance_sampling_ratio/mean": 0.6753652095794678, "sampling/importance_sampling_ratio/min": 0.012616770341992378, "sampling/sampling_logp_difference/max": 2.157594919204712, "sampling/sampling_logp_difference/mean": 0.12069234251976013, "step": 587, "step_time": 10.394444302999545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.8718730211257935, "epoch": 0.00588, "grad_norm": 0.4928179681301117, "kl": 1.4574506878852844, "learning_rate": 9.999964751997164e-06, "loss": 0.0014, "step": 588, "step_time": 5.485576205999678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 601.28125, "completions/mean_terminated_length": 601.28125, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 1.0036451667547226, "epoch": 0.00589, "frac_reward_zero_std": 0.0, "grad_norm": 0.26321664452552795, "kl": 1.0629599764943123, "learning_rate": 9.99996462417153e-06, "loss": -0.0354, "num_tokens": 13468227.0, "reward": 1.256040334701538, "reward_std": 2.981733798980713, "rewards/rollout_reward_func/mean": 1.256040334701538, "rewards/rollout_reward_func/std": 3.341505289077759, "sampling/importance_sampling_ratio/max": 0.9502620100975037, "sampling/importance_sampling_ratio/mean": 0.6047900915145874, "sampling/importance_sampling_ratio/min": 0.07536133378744125, "sampling/sampling_logp_difference/max": 2.3709089756011963, "sampling/sampling_logp_difference/mean": 0.13436472415924072, "step": 589, "step_time": 11.18750996700146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.9917861074209213, "epoch": 0.0059, "grad_norm": 0.2710956037044525, "kl": 1.0071097016334534, "learning_rate": 9.99996449611454e-06, "loss": -0.0361, "step": 590, "step_time": 6.655320740999741 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.03125, "completions/max_length": 586.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 156.25, "completions/mean_terminated_length": 142.3870849609375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5769947096705437, "epoch": 0.00591, "frac_reward_zero_std": 0.5, "grad_norm": 0.2075062245130539, "kl": 0.7070829924196005, "learning_rate": 9.99996436782619e-06, "loss": -0.0075, "num_tokens": 13508867.0, "reward": 1.1608023643493652, "reward_std": 0.9846556782722473, "rewards/rollout_reward_func/mean": 1.1608023643493652, "rewards/rollout_reward_func/std": 3.059985876083374, "sampling/importance_sampling_ratio/max": 1.1250008344650269, "sampling/importance_sampling_ratio/mean": 0.8567406535148621, "sampling/importance_sampling_ratio/min": 0.12207645922899246, "sampling/sampling_logp_difference/max": 1.1718416213989258, "sampling/sampling_logp_difference/mean": 0.06053457409143448, "step": 591, "step_time": 8.45491860599941 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 0.5910933762788773, "epoch": 0.00592, "grad_norm": 0.18798227608203888, "kl": 0.7079349290579557, "learning_rate": 9.999964239306485e-06, "loss": -0.008, "step": 592, "step_time": 4.405803242003458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 400.71875, "completions/mean_terminated_length": 400.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7834379337728024, "epoch": 0.00593, "frac_reward_zero_std": 0.25, "grad_norm": 0.4114716947078705, "kl": 0.996284481137991, "learning_rate": 9.999964110555425e-06, "loss": -0.0262, "num_tokens": 13558938.0, "reward": 2.33146333694458, "reward_std": 2.437620162963867, "rewards/rollout_reward_func/mean": 2.33146333694458, "rewards/rollout_reward_func/std": 3.00563907623291, "sampling/importance_sampling_ratio/max": 1.5149747133255005, "sampling/importance_sampling_ratio/mean": 0.7380173802375793, "sampling/importance_sampling_ratio/min": 0.04286710545420647, "sampling/sampling_logp_difference/max": 2.13527512550354, "sampling/sampling_logp_difference/mean": 0.121505007147789, "step": 593, "step_time": 10.53681524300373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7921121716499329, "epoch": 0.00594, "grad_norm": 0.36626818776130676, "kl": 0.9929966628551483, "learning_rate": 9.999963981573004e-06, "loss": -0.0276, "step": 594, "step_time": 6.275149566001346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 729.0, "completions/max_terminated_length": 729.0, "completions/mean_length": 319.875, "completions/mean_terminated_length": 319.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8693666644394398, "epoch": 0.00595, "frac_reward_zero_std": 0.0, "grad_norm": 1.6256762742996216, "kl": 2.3492750748991966, "learning_rate": 9.999963852359229e-06, "loss": -0.0538, "num_tokens": 13606656.0, "reward": 3.3901143074035645, "reward_std": 2.2746353149414062, "rewards/rollout_reward_func/mean": 3.3901143074035645, "rewards/rollout_reward_func/std": 2.5446417331695557, "sampling/importance_sampling_ratio/max": 1.0549389123916626, "sampling/importance_sampling_ratio/mean": 0.742931067943573, "sampling/importance_sampling_ratio/min": 7.828411467807712e-20, "sampling/sampling_logp_difference/max": 16.34896469116211, "sampling/sampling_logp_difference/mean": 0.3142326772212982, "step": 595, "step_time": 8.963649682000323 }, { "clip_ratio/high_max": 0.020292208530008793, "clip_ratio/high_mean": 0.010146104265004396, "clip_ratio/low_mean": 0.010416666744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020562771009281278, "entropy": 0.8667136430740356, "epoch": 0.00596, "grad_norm": 0.3782597482204437, "kl": 1.4399438872933388, "learning_rate": 9.999963722914094e-06, "loss": -0.0581, "step": 596, "step_time": 5.21561965300134 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0625, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 299.71875, "completions/mean_terminated_length": 284.3000183105469, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9797795861959457, "epoch": 0.00597, "frac_reward_zero_std": 0.0, "grad_norm": 0.34276890754699707, "kl": 1.0586802773177624, "learning_rate": 9.999963593237605e-06, "loss": -0.0593, "num_tokens": 13652901.0, "reward": 1.649234652519226, "reward_std": 2.146238088607788, "rewards/rollout_reward_func/mean": 1.649234652519226, "rewards/rollout_reward_func/std": 3.4937026500701904, "sampling/importance_sampling_ratio/max": 1.1079407930374146, "sampling/importance_sampling_ratio/mean": 0.6694793701171875, "sampling/importance_sampling_ratio/min": 0.025805186480283737, "sampling/sampling_logp_difference/max": 2.687729835510254, "sampling/sampling_logp_difference/mean": 0.12647360563278198, "step": 597, "step_time": 9.170279725 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013494318351149559, "entropy": 1.034283809363842, "epoch": 0.00598, "grad_norm": 0.325570285320282, "kl": 1.042702555656433, "learning_rate": 9.999963463329756e-06, "loss": -0.0603, "step": 598, "step_time": 5.130531893002626 }, { "clip_ratio/high_max": 0.020292208530008793, "clip_ratio/high_mean": 0.010146104265004396, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014052354265004396, "completions/clipped_ratio": 0.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 329.21875, "completions/mean_terminated_length": 329.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5775939151644707, "epoch": 0.00599, "frac_reward_zero_std": 0.0, "grad_norm": 0.3806794285774231, "kl": 1.2741290964186192, "learning_rate": 9.99996333319055e-06, "loss": -0.0371, "num_tokens": 13699654.0, "reward": 1.4827797412872314, "reward_std": 2.4253482818603516, "rewards/rollout_reward_func/mean": 1.4827797412872314, "rewards/rollout_reward_func/std": 3.5282559394836426, "sampling/importance_sampling_ratio/max": 0.9717879891395569, "sampling/importance_sampling_ratio/mean": 0.5862716436386108, "sampling/importance_sampling_ratio/min": 0.002158098155632615, "sampling/sampling_logp_difference/max": 3.153642177581787, "sampling/sampling_logp_difference/mean": 0.24169734120368958, "step": 599, "step_time": 10.21698569700311 }, { "clip_ratio/high_max": 0.03720238246023655, "clip_ratio/high_mean": 0.024283009581267834, "clip_ratio/low_mean": 0.020004735328257084, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04428774397820234, "entropy": 1.5886806473135948, "epoch": 0.006, "grad_norm": 0.3525286316871643, "kl": 1.2521369718015194, "learning_rate": 9.999963202819989e-06, "loss": -0.038, "step": 600, "step_time": 5.804941350001172 }, { "clip_ratio/high_max": 0.01854395680129528, "clip_ratio/high_mean": 0.00927197840064764, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00927197840064764, "completions/clipped_ratio": 0.0625, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 399.28125, "completions/mean_terminated_length": 377.4666748046875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6173338741064072, "epoch": 0.00601, "frac_reward_zero_std": 0.25, "grad_norm": 0.24199911952018738, "kl": 1.408978471532464, "learning_rate": 9.999963072218071e-06, "loss": -0.0031, "num_tokens": 13750522.0, "reward": 1.778174877166748, "reward_std": 1.8821386098861694, "rewards/rollout_reward_func/mean": 1.778174877166748, "rewards/rollout_reward_func/std": 3.1761810779571533, "sampling/importance_sampling_ratio/max": 1.0341734886169434, "sampling/importance_sampling_ratio/mean": 0.525891900062561, "sampling/importance_sampling_ratio/min": 8.369654125317538e-08, "sampling/sampling_logp_difference/max": 8.868558883666992, "sampling/sampling_logp_difference/mean": 0.2713455557823181, "step": 601, "step_time": 10.622783120001259 }, { "clip_ratio/high_max": 0.01854395680129528, "clip_ratio/high_mean": 0.00927197840064764, "clip_ratio/low_mean": 0.018526785541325808, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.027798763941973448, "entropy": 1.6344863325357437, "epoch": 0.00602, "grad_norm": 0.17335963249206543, "kl": 1.3953926768153906, "learning_rate": 9.999962941384794e-06, "loss": -0.0037, "step": 602, "step_time": 5.803674230000979 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.036117789801210165, "clip_ratio/low_min": 0.014423076994717121, "clip_ratio/region_mean": 0.042696737218648195, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 456.5625, "completions/mean_terminated_length": 456.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9303210973739624, "epoch": 0.00603, "frac_reward_zero_std": 0.0, "grad_norm": 0.6812057495117188, "kl": 1.009556271135807, "learning_rate": 9.999962810320161e-06, "loss": -0.0251, "num_tokens": 13802186.0, "reward": -0.8246719241142273, "reward_std": 1.95017671585083, "rewards/rollout_reward_func/mean": -0.8246719241142273, "rewards/rollout_reward_func/std": 2.8833954334259033, "sampling/importance_sampling_ratio/max": 0.9663796424865723, "sampling/importance_sampling_ratio/mean": 0.41272103786468506, "sampling/importance_sampling_ratio/min": 8.545434189910148e-15, "sampling/sampling_logp_difference/max": 14.32408332824707, "sampling/sampling_logp_difference/mean": 0.40167224407196045, "step": 603, "step_time": 12.616871852002078 }, { "clip_ratio/high_max": 0.038157896138727665, "clip_ratio/high_mean": 0.019078948069363832, "clip_ratio/low_mean": 0.044130610302090645, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06320955837145448, "entropy": 1.935618594288826, "epoch": 0.00604, "grad_norm": 0.24386823177337646, "kl": 0.8757432401180267, "learning_rate": 9.99996267902417e-06, "loss": -0.0276, "step": 604, "step_time": 6.604218295997271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 426.9375, "completions/mean_terminated_length": 426.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8641587272286415, "epoch": 0.00605, "frac_reward_zero_std": 0.0, "grad_norm": 0.11909927427768707, "kl": 1.0592916831374168, "learning_rate": 9.999962547496824e-06, "loss": 0.0194, "num_tokens": 13854657.0, "reward": 2.785393714904785, "reward_std": 1.018348217010498, "rewards/rollout_reward_func/mean": 2.785393714904785, "rewards/rollout_reward_func/std": 3.147953987121582, "sampling/importance_sampling_ratio/max": 0.9456256031990051, "sampling/importance_sampling_ratio/mean": 0.7128843069076538, "sampling/importance_sampling_ratio/min": 2.3731250621494837e-05, "sampling/sampling_logp_difference/max": 2.085736036300659, "sampling/sampling_logp_difference/mean": 0.15107056498527527, "step": 605, "step_time": 10.159318956004427 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 0.8671917133033276, "epoch": 0.00606, "grad_norm": 0.1890491098165512, "kl": 1.0677382536232471, "learning_rate": 9.99996241573812e-06, "loss": 0.0192, "step": 606, "step_time": 5.575266787003784 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 377.1875, "completions/mean_terminated_length": 377.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6558318175375462, "epoch": 0.00607, "frac_reward_zero_std": 0.0, "grad_norm": 0.33814653754234314, "kl": 1.0723159424960613, "learning_rate": 9.999962283748057e-06, "loss": -0.0229, "num_tokens": 13905433.0, "reward": 2.3951754570007324, "reward_std": 2.4390506744384766, "rewards/rollout_reward_func/mean": 2.3951754570007324, "rewards/rollout_reward_func/std": 3.083226442337036, "sampling/importance_sampling_ratio/max": 0.9695119261741638, "sampling/importance_sampling_ratio/mean": 0.7873752117156982, "sampling/importance_sampling_ratio/min": 0.050945453345775604, "sampling/sampling_logp_difference/max": 1.6577547788619995, "sampling/sampling_logp_difference/mean": 0.07914172112941742, "step": 607, "step_time": 9.890208790999168 }, { "clip_ratio/high_max": 0.031746032647788525, "clip_ratio/high_mean": 0.015873016323894262, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015873016323894262, "entropy": 0.6756963059306145, "epoch": 0.00608, "grad_norm": 0.23984971642494202, "kl": 1.0748817510902882, "learning_rate": 9.999962151526639e-06, "loss": -0.024, "step": 608, "step_time": 5.003469583996775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 326.75, "completions/mean_terminated_length": 326.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8293604888021946, "epoch": 0.00609, "frac_reward_zero_std": 0.5, "grad_norm": 0.1849093735218048, "kl": 0.617501774802804, "learning_rate": 9.999962019073862e-06, "loss": 0.0044, "num_tokens": 13949627.0, "reward": 1.7964401245117188, "reward_std": 1.2776362895965576, "rewards/rollout_reward_func/mean": 1.7964401245117188, "rewards/rollout_reward_func/std": 2.8726868629455566, "sampling/importance_sampling_ratio/max": 0.9737984538078308, "sampling/importance_sampling_ratio/mean": 0.7691824436187744, "sampling/importance_sampling_ratio/min": 0.0012937560677528381, "sampling/sampling_logp_difference/max": 1.9581303596496582, "sampling/sampling_logp_difference/mean": 0.10533257573843002, "step": 609, "step_time": 11.317982786998982 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008097166195511818, "entropy": 0.8697840869426727, "epoch": 0.0061, "grad_norm": 0.18923480808734894, "kl": 0.6138432007282972, "learning_rate": 9.999961886389731e-06, "loss": 0.0036, "step": 610, "step_time": 5.763494528000592 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 215.3125, "completions/mean_terminated_length": 215.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5840349867939949, "epoch": 0.00611, "frac_reward_zero_std": 0.5, "grad_norm": 0.19868458807468414, "kl": 1.064110835082829, "learning_rate": 9.999961753474244e-06, "loss": 0.0066, "num_tokens": 13990733.0, "reward": 4.45039701461792, "reward_std": 0.12824338674545288, "rewards/rollout_reward_func/mean": 4.45039701461792, "rewards/rollout_reward_func/std": 0.5007718801498413, "sampling/importance_sampling_ratio/max": 1.2976516485214233, "sampling/importance_sampling_ratio/mean": 0.8608546257019043, "sampling/importance_sampling_ratio/min": 0.21005511283874512, "sampling/sampling_logp_difference/max": 1.767249345779419, "sampling/sampling_logp_difference/mean": 0.07131876051425934, "step": 611, "step_time": 11.05170112199994 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.6407817155122757, "epoch": 0.00612, "grad_norm": 0.183018758893013, "kl": 1.0669329101219773, "learning_rate": 9.999961620327397e-06, "loss": 0.0063, "step": 612, "step_time": 5.928435346002516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1243.0, "completions/max_terminated_length": 1243.0, "completions/mean_length": 219.0, "completions/mean_terminated_length": 219.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7891772836446762, "epoch": 0.00613, "frac_reward_zero_std": 0.25, "grad_norm": 0.15402358770370483, "kl": 0.8806322365999222, "learning_rate": 9.999961486949193e-06, "loss": -0.0224, "num_tokens": 14031412.0, "reward": 0.710405707359314, "reward_std": 2.0941386222839355, "rewards/rollout_reward_func/mean": 0.710405707359314, "rewards/rollout_reward_func/std": 3.299563407897949, "sampling/importance_sampling_ratio/max": 0.9728391170501709, "sampling/importance_sampling_ratio/mean": 0.5152851343154907, "sampling/importance_sampling_ratio/min": 0.0005672583938576281, "sampling/sampling_logp_difference/max": 2.6221628189086914, "sampling/sampling_logp_difference/mean": 0.2546249032020569, "step": 613, "step_time": 11.16590789899783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020217803539708257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020217803539708257, "entropy": 1.8108076304197311, "epoch": 0.00614, "grad_norm": 0.15904659032821655, "kl": 0.9099494256079197, "learning_rate": 9.999961353339632e-06, "loss": -0.0223, "step": 614, "step_time": 5.870229932001166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.03125, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 726.15625, "completions/mean_terminated_length": 725.4193115234375, "completions/min_length": 402.0, "completions/min_terminated_length": 402.0, "entropy": 1.3902777284383774, "epoch": 0.00615, "frac_reward_zero_std": 0.0, "grad_norm": 0.295984148979187, "kl": 0.9497066512703896, "learning_rate": 9.999961219498714e-06, "loss": -0.0225, "num_tokens": 14093635.0, "reward": 2.0338492393493652, "reward_std": 2.516486883163452, "rewards/rollout_reward_func/mean": 2.0338492393493652, "rewards/rollout_reward_func/std": 3.2513930797576904, "sampling/importance_sampling_ratio/max": 0.9437042474746704, "sampling/importance_sampling_ratio/mean": 0.478537917137146, "sampling/importance_sampling_ratio/min": 1.4851340372334237e-13, "sampling/sampling_logp_difference/max": 11.901616096496582, "sampling/sampling_logp_difference/mean": 0.2837095558643341, "step": 615, "step_time": 11.343585886996152 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.007211538730189204, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01111778849735856, "entropy": 1.3842445239424706, "epoch": 0.00616, "grad_norm": 0.26325491070747375, "kl": 0.9370283260941505, "learning_rate": 9.999961085426441e-06, "loss": -0.0225, "step": 616, "step_time": 5.859756915995604 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 372.625, "completions/mean_terminated_length": 372.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3553109243512154, "epoch": 0.00617, "frac_reward_zero_std": 0.25, "grad_norm": 0.22005431354045868, "kl": 1.397843074053526, "learning_rate": 9.99996095112281e-06, "loss": -0.0529, "num_tokens": 14144205.0, "reward": 1.1742334365844727, "reward_std": 2.540992259979248, "rewards/rollout_reward_func/mean": 1.1742334365844727, "rewards/rollout_reward_func/std": 3.7357077598571777, "sampling/importance_sampling_ratio/max": 0.9767733812332153, "sampling/importance_sampling_ratio/mean": 0.6038128137588501, "sampling/importance_sampling_ratio/min": 0.0033615517895668745, "sampling/sampling_logp_difference/max": 3.276549816131592, "sampling/sampling_logp_difference/mean": 0.1811521202325821, "step": 617, "step_time": 10.855536543998824 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.015178571920841932, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024793956894427538, "entropy": 1.3587193563580513, "epoch": 0.00618, "grad_norm": 0.1793127954006195, "kl": 1.3988219127058983, "learning_rate": 9.999960816587823e-06, "loss": -0.0536, "step": 618, "step_time": 5.668675213000824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.029861111659556627, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.029861111659556627, "completions/clipped_ratio": 0.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 277.25, "completions/mean_terminated_length": 277.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5231379196047783, "epoch": 0.00619, "frac_reward_zero_std": 0.25, "grad_norm": 0.44033652544021606, "kl": 1.0725789070129395, "learning_rate": 9.999960681821476e-06, "loss": -0.0098, "num_tokens": 14188235.0, "reward": 1.3513355255126953, "reward_std": 2.4040334224700928, "rewards/rollout_reward_func/mean": 1.3513355255126953, "rewards/rollout_reward_func/std": 3.2658941745758057, "sampling/importance_sampling_ratio/max": 0.9726054668426514, "sampling/importance_sampling_ratio/mean": 0.5824894905090332, "sampling/importance_sampling_ratio/min": 0.05767630413174629, "sampling/sampling_logp_difference/max": 2.6129279136657715, "sampling/sampling_logp_difference/mean": 0.20279082655906677, "step": 619, "step_time": 9.609497685998576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "entropy": 1.5225640684366226, "epoch": 0.0062, "grad_norm": 0.41600844264030457, "kl": 1.076061025261879, "learning_rate": 9.999960546823774e-06, "loss": -0.0105, "step": 620, "step_time": 5.750868601002367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 259.96875, "completions/mean_terminated_length": 259.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7832685336470604, "epoch": 0.00621, "frac_reward_zero_std": 0.0, "grad_norm": 0.27419182658195496, "kl": 1.374557737261057, "learning_rate": 9.999960411594717e-06, "loss": -0.0248, "num_tokens": 14233513.0, "reward": 1.0645502805709839, "reward_std": 2.6695666313171387, "rewards/rollout_reward_func/mean": 1.0645502805709839, "rewards/rollout_reward_func/std": 3.3028533458709717, "sampling/importance_sampling_ratio/max": 1.030534267425537, "sampling/importance_sampling_ratio/mean": 0.8130946159362793, "sampling/importance_sampling_ratio/min": 0.04394271597266197, "sampling/sampling_logp_difference/max": 1.901428461074829, "sampling/sampling_logp_difference/mean": 0.10147994756698608, "step": 621, "step_time": 10.289893906998259 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01175213698297739, "entropy": 0.7599999532103539, "epoch": 0.00622, "grad_norm": 0.19239917397499084, "kl": 1.3693112879991531, "learning_rate": 9.9999602761343e-06, "loss": -0.0251, "step": 622, "step_time": 5.483066981003503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 1242.0, "completions/max_terminated_length": 1242.0, "completions/mean_length": 246.9375, "completions/mean_terminated_length": 246.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.606779545545578, "epoch": 0.00623, "frac_reward_zero_std": 0.5, "grad_norm": 0.08534824103116989, "kl": 1.1909367591142654, "learning_rate": 9.999960140442528e-06, "loss": -0.0052, "num_tokens": 14275269.0, "reward": 2.8308348655700684, "reward_std": 0.05606721714138985, "rewards/rollout_reward_func/mean": 2.8308348655700684, "rewards/rollout_reward_func/std": 2.6080596446990967, "sampling/importance_sampling_ratio/max": 0.9734424948692322, "sampling/importance_sampling_ratio/mean": 0.8347679376602173, "sampling/importance_sampling_ratio/min": 0.186862513422966, "sampling/sampling_logp_difference/max": 1.5110805034637451, "sampling/sampling_logp_difference/mean": 0.06078411638736725, "step": 623, "step_time": 10.480294804998266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6046528685837984, "epoch": 0.00624, "grad_norm": 0.08682138472795486, "kl": 1.190664113033563, "learning_rate": 9.999960004519398e-06, "loss": -0.0051, "step": 624, "step_time": 6.42876906699712 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 306.46875, "completions/mean_terminated_length": 306.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9886305816471577, "epoch": 0.00625, "frac_reward_zero_std": 0.0, "grad_norm": 0.7932879328727722, "kl": 0.8141480572521687, "learning_rate": 9.999959868364912e-06, "loss": -0.0326, "num_tokens": 14323135.0, "reward": 1.333662509918213, "reward_std": 2.2111451625823975, "rewards/rollout_reward_func/mean": 1.333662509918213, "rewards/rollout_reward_func/std": 3.7581756114959717, "sampling/importance_sampling_ratio/max": 1.3131448030471802, "sampling/importance_sampling_ratio/mean": 0.7386767268180847, "sampling/importance_sampling_ratio/min": 0.03610260784626007, "sampling/sampling_logp_difference/max": 1.7932143211364746, "sampling/sampling_logp_difference/mean": 0.13017217814922333, "step": 625, "step_time": 10.575842488997296 }, { "clip_ratio/high_max": 0.053125000558793545, "clip_ratio/high_mean": 0.026562500279396772, "clip_ratio/low_mean": 0.024479167070239782, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.05104166688397527, "entropy": 0.982837624847889, "epoch": 0.00626, "grad_norm": 0.23012681305408478, "kl": 0.8066022880375385, "learning_rate": 9.99995973197907e-06, "loss": -0.0367, "step": 626, "step_time": 6.154285369002537 }, { "clip_ratio/high_max": 0.02003205195069313, "clip_ratio/high_mean": 0.010016025975346565, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014823718462139368, "completions/clipped_ratio": 0.03125, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 706.84375, "completions/mean_terminated_length": 696.741943359375, "completions/min_length": 322.0, "completions/min_terminated_length": 322.0, "entropy": 1.5190394222736359, "epoch": 0.00627, "frac_reward_zero_std": 0.0, "grad_norm": 0.19023339450359344, "kl": 1.2471973896026611, "learning_rate": 9.99995959536187e-06, "loss": -0.0598, "num_tokens": 14385036.0, "reward": 2.819486141204834, "reward_std": 2.791869878768921, "rewards/rollout_reward_func/mean": 2.819486141204834, "rewards/rollout_reward_func/std": 2.8635826110839844, "sampling/importance_sampling_ratio/max": 0.9419726729393005, "sampling/importance_sampling_ratio/mean": 0.4701436758041382, "sampling/importance_sampling_ratio/min": 0.01716926507651806, "sampling/sampling_logp_difference/max": 2.5739617347717285, "sampling/sampling_logp_difference/mean": 0.19964450597763062, "step": 627, "step_time": 11.71903872099756 }, { "clip_ratio/high_max": 0.031250000931322575, "clip_ratio/high_mean": 0.015625000465661287, "clip_ratio/low_mean": 0.015224359463900328, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.030849359929561615, "entropy": 1.5349342375993729, "epoch": 0.00628, "grad_norm": 0.17624539136886597, "kl": 1.2440728098154068, "learning_rate": 9.999959458513314e-06, "loss": -0.0602, "step": 628, "step_time": 6.837878299000295 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0018382353009656072, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008088235394097865, "completions/clipped_ratio": 0.0, "completions/max_length": 1007.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 518.71875, "completions/mean_terminated_length": 518.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1142145171761513, "epoch": 0.00629, "frac_reward_zero_std": 0.0, "grad_norm": 0.3003154397010803, "kl": 1.3782563414424658, "learning_rate": 9.9999593214334e-06, "loss": -0.0572, "num_tokens": 14441524.0, "reward": 2.664752960205078, "reward_std": 2.97743558883667, "rewards/rollout_reward_func/mean": 2.664752960205078, "rewards/rollout_reward_func/std": 3.294696807861328, "sampling/importance_sampling_ratio/max": 0.9668473601341248, "sampling/importance_sampling_ratio/mean": 0.6224624514579773, "sampling/importance_sampling_ratio/min": 1.8064288487598395e-20, "sampling/sampling_logp_difference/max": 17.298194885253906, "sampling/sampling_logp_difference/mean": 0.3072396516799927, "step": 629, "step_time": 10.586349574998167 }, { "clip_ratio/high_max": 0.037500000558793545, "clip_ratio/high_mean": 0.018750000279396772, "clip_ratio/low_mean": 0.009926470695063472, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028676470974460244, "entropy": 1.1114652529358864, "epoch": 0.0063, "grad_norm": 0.20465491712093353, "kl": 1.3794573098421097, "learning_rate": 9.999959184122127e-06, "loss": -0.0582, "step": 630, "step_time": 6.214844338996045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 327.3125, "completions/mean_terminated_length": 327.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0050673708319664, "epoch": 0.00631, "frac_reward_zero_std": 0.25, "grad_norm": 2.3887104988098145, "kl": 1.905240684747696, "learning_rate": 9.9999590465795e-06, "loss": -0.0337, "num_tokens": 14487410.0, "reward": 1.3303648233413696, "reward_std": 1.4977093935012817, "rewards/rollout_reward_func/mean": 1.3303648233413696, "rewards/rollout_reward_func/std": 3.326721429824829, "sampling/importance_sampling_ratio/max": 1.0008097887039185, "sampling/importance_sampling_ratio/mean": 0.6959946155548096, "sampling/importance_sampling_ratio/min": 5.201548013925722e-15, "sampling/sampling_logp_difference/max": 11.365747451782227, "sampling/sampling_logp_difference/mean": 0.2577168941497803, "step": 631, "step_time": 10.773512987998402 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0020833334419876337, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014583333628252149, "entropy": 0.9942825399339199, "epoch": 0.00632, "grad_norm": 0.20413190126419067, "kl": 1.2100132182240486, "learning_rate": 9.999958908805518e-06, "loss": -0.0397, "step": 632, "step_time": 5.871666302004087 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1387.0, "completions/max_terminated_length": 1387.0, "completions/mean_length": 641.0, "completions/mean_terminated_length": 641.0, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9095337390899658, "epoch": 0.00633, "frac_reward_zero_std": 0.0, "grad_norm": 0.5694666504859924, "kl": 1.1563314497470856, "learning_rate": 9.999958770800176e-06, "loss": -0.0473, "num_tokens": 14546830.0, "reward": 0.8617339134216309, "reward_std": 2.1787381172180176, "rewards/rollout_reward_func/mean": 0.8617339134216309, "rewards/rollout_reward_func/std": 3.468179225921631, "sampling/importance_sampling_ratio/max": 1.0103405714035034, "sampling/importance_sampling_ratio/mean": 0.40763962268829346, "sampling/importance_sampling_ratio/min": 0.0042762490920722485, "sampling/sampling_logp_difference/max": 3.0050923824310303, "sampling/sampling_logp_difference/mean": 0.29615509510040283, "step": 633, "step_time": 12.686146467998697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.902773231267929, "epoch": 0.00634, "grad_norm": 0.403490275144577, "kl": 1.1293634176254272, "learning_rate": 9.999958632563477e-06, "loss": -0.0474, "step": 634, "step_time": 6.892940701998668 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01226076576858759, "completions/clipped_ratio": 0.03125, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 253.6875, "completions/mean_terminated_length": 261.3548278808594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9010340012609959, "epoch": 0.00635, "frac_reward_zero_std": 0.5, "grad_norm": 0.11950567364692688, "kl": 0.814842002466321, "learning_rate": 9.999958494095424e-06, "loss": -0.0361, "num_tokens": 14588080.0, "reward": 3.751854419708252, "reward_std": 1.3082363605499268, "rewards/rollout_reward_func/mean": 3.751854419708252, "rewards/rollout_reward_func/std": 1.840853214263916, "sampling/importance_sampling_ratio/max": 0.9740232229232788, "sampling/importance_sampling_ratio/mean": 0.7390027046203613, "sampling/importance_sampling_ratio/min": 1.696749620805349e-07, "sampling/sampling_logp_difference/max": 3.747145891189575, "sampling/sampling_logp_difference/mean": 0.20019342005252838, "step": 635, "step_time": 10.167632999002308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 0.8895482160151005, "epoch": 0.00636, "grad_norm": 0.11076949536800385, "kl": 0.8179183686152101, "learning_rate": 9.999958355396011e-06, "loss": -0.0363, "step": 636, "step_time": 5.711176624001382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 418.40625, "completions/mean_terminated_length": 418.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1017285510897636, "epoch": 0.00637, "frac_reward_zero_std": 0.0, "grad_norm": 0.6274805068969727, "kl": 0.7497348934412003, "learning_rate": 9.999958216465244e-06, "loss": -0.0324, "num_tokens": 14638276.0, "reward": 0.8811352252960205, "reward_std": 1.939726710319519, "rewards/rollout_reward_func/mean": 0.8811352252960205, "rewards/rollout_reward_func/std": 3.525312900543213, "sampling/importance_sampling_ratio/max": 0.9783987998962402, "sampling/importance_sampling_ratio/mean": 0.630449116230011, "sampling/importance_sampling_ratio/min": 0.022300677374005318, "sampling/sampling_logp_difference/max": 2.4952352046966553, "sampling/sampling_logp_difference/mean": 0.16973555088043213, "step": 637, "step_time": 11.824788485006138 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.014756944496184587, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020438762847334146, "entropy": 1.0928854681551456, "epoch": 0.00638, "grad_norm": 0.47242894768714905, "kl": 0.7280199397355318, "learning_rate": 9.99995807730312e-06, "loss": -0.0346, "step": 638, "step_time": 6.059368253001594 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 633.0, "completions/max_terminated_length": 633.0, "completions/mean_length": 205.21875, "completions/mean_terminated_length": 205.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5031264182180166, "epoch": 0.00639, "frac_reward_zero_std": 0.5, "grad_norm": 0.5332826972007751, "kl": 0.48189798276871443, "learning_rate": 9.999957937909638e-06, "loss": -0.0233, "num_tokens": 14677454.0, "reward": 3.7067623138427734, "reward_std": 1.0329461097717285, "rewards/rollout_reward_func/mean": 3.7067623138427734, "rewards/rollout_reward_func/std": 1.7722975015640259, "sampling/importance_sampling_ratio/max": 0.9898112416267395, "sampling/importance_sampling_ratio/mean": 0.8714514970779419, "sampling/importance_sampling_ratio/min": 9.654356463095897e-17, "sampling/sampling_logp_difference/max": 15.946939468383789, "sampling/sampling_logp_difference/mean": 0.2729547619819641, "step": 639, "step_time": 8.920413215000735 }, { "clip_ratio/high_max": 0.04513888992369175, "clip_ratio/high_mean": 0.022569444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.022569444961845875, "entropy": 0.46903982758522034, "epoch": 0.0064, "grad_norm": 0.04859568923711777, "kl": 0.47033386304974556, "learning_rate": 9.9999577982848e-06, "loss": -0.0248, "step": 640, "step_time": 4.5504143350008235 }, { "clip_ratio/high_max": 0.021780303679406643, "clip_ratio/high_mean": 0.010890151839703321, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010890151839703321, "completions/clipped_ratio": 0.0, "completions/max_length": 1247.0, "completions/max_terminated_length": 1247.0, "completions/mean_length": 440.40625, "completions/mean_terminated_length": 440.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1696774438023567, "epoch": 0.00641, "frac_reward_zero_std": 0.0, "grad_norm": 0.40062767267227173, "kl": 0.8524576015770435, "learning_rate": 9.999957658428603e-06, "loss": -0.0348, "num_tokens": 14727442.0, "reward": 1.3328903913497925, "reward_std": 1.9296107292175293, "rewards/rollout_reward_func/mean": 1.3328903913497925, "rewards/rollout_reward_func/std": 3.3019583225250244, "sampling/importance_sampling_ratio/max": 0.9698357582092285, "sampling/importance_sampling_ratio/mean": 0.6180865168571472, "sampling/importance_sampling_ratio/min": 0.025273984298110008, "sampling/sampling_logp_difference/max": 2.2967331409454346, "sampling/sampling_logp_difference/mean": 0.15891844034194946, "step": 641, "step_time": 11.658176643997649 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.03080808138474822, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03648989973589778, "entropy": 1.2125002071261406, "epoch": 0.00642, "grad_norm": 0.26877257227897644, "kl": 0.8491060733795166, "learning_rate": 9.999957518341053e-06, "loss": -0.0368, "step": 642, "step_time": 5.946877609001604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.020312499720603228, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.020312499720603228, "completions/clipped_ratio": 0.0, "completions/max_length": 944.0, "completions/max_terminated_length": 944.0, "completions/mean_length": 412.3125, "completions/mean_terminated_length": 412.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7362595573067665, "epoch": 0.00643, "frac_reward_zero_std": 0.25, "grad_norm": 0.402377724647522, "kl": 1.7846077792346478, "learning_rate": 9.999957378022144e-06, "loss": 0.0066, "num_tokens": 14776830.0, "reward": 0.6791543960571289, "reward_std": 2.06803560256958, "rewards/rollout_reward_func/mean": 0.6791543960571289, "rewards/rollout_reward_func/std": 3.1287834644317627, "sampling/importance_sampling_ratio/max": 1.0333467721939087, "sampling/importance_sampling_ratio/mean": 0.6012892723083496, "sampling/importance_sampling_ratio/min": 6.572490747203119e-07, "sampling/sampling_logp_difference/max": 2.867222785949707, "sampling/sampling_logp_difference/mean": 0.29215073585510254, "step": 643, "step_time": 10.561791199001163 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.040624999441206455, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.05198863614350557, "entropy": 1.7058254927396774, "epoch": 0.00644, "grad_norm": 0.435063898563385, "kl": 1.6699198484420776, "learning_rate": 9.999957237471878e-06, "loss": 0.0048, "step": 644, "step_time": 5.344641216999662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 187.5, "completions/mean_terminated_length": 187.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9700680002570152, "epoch": 0.00645, "frac_reward_zero_std": 0.25, "grad_norm": 0.4845218360424042, "kl": 1.0879684910178185, "learning_rate": 9.999957096690258e-06, "loss": -0.0453, "num_tokens": 14816839.0, "reward": 1.3670560121536255, "reward_std": 2.137669086456299, "rewards/rollout_reward_func/mean": 1.3670560121536255, "rewards/rollout_reward_func/std": 3.450723886489868, "sampling/importance_sampling_ratio/max": 0.9835588335990906, "sampling/importance_sampling_ratio/mean": 0.7673646211624146, "sampling/importance_sampling_ratio/min": 0.011094286106526852, "sampling/sampling_logp_difference/max": 2.1860570907592773, "sampling/sampling_logp_difference/mean": 0.12013061344623566, "step": 645, "step_time": 10.343129519000286 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017708333674818277, "entropy": 0.9260047152638435, "epoch": 0.00646, "grad_norm": 0.800739586353302, "kl": 0.9983628503978252, "learning_rate": 9.999956955677278e-06, "loss": -0.0461, "step": 646, "step_time": 5.442046955000478 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.0182291679084301, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0182291679084301, "completions/clipped_ratio": 0.0, "completions/max_length": 1207.0, "completions/max_terminated_length": 1207.0, "completions/mean_length": 295.625, "completions/mean_terminated_length": 295.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2046060487627983, "epoch": 0.00647, "frac_reward_zero_std": 0.25, "grad_norm": 0.4007033407688141, "kl": 1.0493207685649395, "learning_rate": 9.999956814432943e-06, "loss": 0.0026, "num_tokens": 14861992.0, "reward": 2.137017250061035, "reward_std": 2.6670210361480713, "rewards/rollout_reward_func/mean": 2.137017250061035, "rewards/rollout_reward_func/std": 3.419712781906128, "sampling/importance_sampling_ratio/max": 0.9723027944564819, "sampling/importance_sampling_ratio/mean": 0.6485604643821716, "sampling/importance_sampling_ratio/min": 0.03890863433480263, "sampling/sampling_logp_difference/max": 2.555133581161499, "sampling/sampling_logp_difference/mean": 0.17528605461120605, "step": 647, "step_time": 11.250963220998528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.028645834885537624, "clip_ratio/low_min": 0.015625, "clip_ratio/region_mean": 0.028645834885537624, "entropy": 1.2674690186977386, "epoch": 0.00648, "grad_norm": 0.40541183948516846, "kl": 1.1181132644414902, "learning_rate": 9.99995667295725e-06, "loss": 0.001, "step": 648, "step_time": 5.893909691998488 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1053.0, "completions/max_terminated_length": 1053.0, "completions/mean_length": 498.15625, "completions/mean_terminated_length": 498.15625, "completions/min_length": 294.0, "completions/min_terminated_length": 294.0, "entropy": 1.2129886820912361, "epoch": 0.00649, "frac_reward_zero_std": 0.0, "grad_norm": 0.4585494101047516, "kl": 1.1243792548775673, "learning_rate": 9.999956531250203e-06, "loss": -0.054, "num_tokens": 14916986.0, "reward": 1.9812252521514893, "reward_std": 3.4381186962127686, "rewards/rollout_reward_func/mean": 1.9812252521514893, "rewards/rollout_reward_func/std": 3.3806610107421875, "sampling/importance_sampling_ratio/max": 1.0230871438980103, "sampling/importance_sampling_ratio/mean": 0.6589874029159546, "sampling/importance_sampling_ratio/min": 0.008367743343114853, "sampling/sampling_logp_difference/max": 2.3504819869995117, "sampling/sampling_logp_difference/mean": 0.1694604754447937, "step": 649, "step_time": 11.15429605999816 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2066712751984596, "epoch": 0.0065, "grad_norm": 0.3655455410480499, "kl": 1.1589130237698555, "learning_rate": 9.999956389311796e-06, "loss": -0.0544, "step": 650, "step_time": 5.64978333399813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 432.90625, "completions/mean_terminated_length": 432.90625, "completions/min_length": 310.0, "completions/min_terminated_length": 310.0, "entropy": 0.6469353884458542, "epoch": 0.00651, "frac_reward_zero_std": 0.0, "grad_norm": 0.17504607141017914, "kl": 1.1837955564260483, "learning_rate": 9.999956247142036e-06, "loss": -0.0178, "num_tokens": 14969881.0, "reward": 3.4348134994506836, "reward_std": 1.6794257164001465, "rewards/rollout_reward_func/mean": 3.4348134994506836, "rewards/rollout_reward_func/std": 2.6251301765441895, "sampling/importance_sampling_ratio/max": 1.3574979305267334, "sampling/importance_sampling_ratio/mean": 0.8197981119155884, "sampling/importance_sampling_ratio/min": 0.04192608594894409, "sampling/sampling_logp_difference/max": 1.3272209167480469, "sampling/sampling_logp_difference/mean": 0.06981360912322998, "step": 651, "step_time": 9.684787112004415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6538550220429897, "epoch": 0.00652, "grad_norm": 0.1898570954799652, "kl": 1.1870203390717506, "learning_rate": 9.999956104740917e-06, "loss": -0.0178, "step": 652, "step_time": 4.832968143000471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.03125, "completions/max_length": 308.0, "completions/max_terminated_length": 308.0, "completions/mean_length": 22.4375, "completions/mean_terminated_length": 22.645160675048828, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2451050765812397, "epoch": 0.00653, "frac_reward_zero_std": 0.75, "grad_norm": 0.09935812652111053, "kl": 0.6289171865209937, "learning_rate": 9.99995596210844e-06, "loss": -0.0105, "num_tokens": 15001061.0, "reward": 3.31170654296875, "reward_std": 0.8340325951576233, "rewards/rollout_reward_func/mean": 3.31170654296875, "rewards/rollout_reward_func/std": 1.954899787902832, "sampling/importance_sampling_ratio/max": 0.9725627899169922, "sampling/importance_sampling_ratio/mean": 0.7233012914657593, "sampling/importance_sampling_ratio/min": 0.0034936913289129734, "sampling/sampling_logp_difference/max": 2.298006057739258, "sampling/sampling_logp_difference/mean": 0.1797812581062317, "step": 653, "step_time": 7.317887123001128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011101973708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011101973708719015, "entropy": 1.2523799203336239, "epoch": 0.00654, "grad_norm": 0.07142390310764313, "kl": 0.6245444091036916, "learning_rate": 9.99995581924461e-06, "loss": -0.0106, "step": 654, "step_time": 3.581809017001433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 264.15625, "completions/mean_terminated_length": 264.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1374192722141743, "epoch": 0.00655, "frac_reward_zero_std": 0.25, "grad_norm": 0.4590197801589966, "kl": 1.3661166597157717, "learning_rate": 9.999955676149421e-06, "loss": -0.0499, "num_tokens": 15045509.0, "reward": 3.037644863128662, "reward_std": 2.2348392009735107, "rewards/rollout_reward_func/mean": 3.037644863128662, "rewards/rollout_reward_func/std": 2.8819756507873535, "sampling/importance_sampling_ratio/max": 0.9720999598503113, "sampling/importance_sampling_ratio/mean": 0.6977952718734741, "sampling/importance_sampling_ratio/min": 0.010317157953977585, "sampling/sampling_logp_difference/max": 2.819347858428955, "sampling/sampling_logp_difference/mean": 0.1480112373828888, "step": 655, "step_time": 9.717003750001822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "entropy": 1.1306128725409508, "epoch": 0.00656, "grad_norm": 0.22944822907447815, "kl": 1.2826583683490753, "learning_rate": 9.999955532822876e-06, "loss": -0.051, "step": 656, "step_time": 5.875947337997786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1042.0, "completions/max_terminated_length": 1042.0, "completions/mean_length": 323.59375, "completions/mean_terminated_length": 323.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.083445217460394, "epoch": 0.00657, "frac_reward_zero_std": 0.0, "grad_norm": 0.2209833860397339, "kl": 0.9042162708938122, "learning_rate": 9.999955389264975e-06, "loss": -0.0491, "num_tokens": 15092859.0, "reward": 1.236527681350708, "reward_std": 1.908660888671875, "rewards/rollout_reward_func/mean": 1.236527681350708, "rewards/rollout_reward_func/std": 3.48577618598938, "sampling/importance_sampling_ratio/max": 1.0675920248031616, "sampling/importance_sampling_ratio/mean": 0.7598190307617188, "sampling/importance_sampling_ratio/min": 2.2797865312895738e-05, "sampling/sampling_logp_difference/max": 3.2936158180236816, "sampling/sampling_logp_difference/mean": 0.1782781332731247, "step": 657, "step_time": 10.022322180999254 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.0764825493097305, "epoch": 0.00658, "grad_norm": 0.2557292878627777, "kl": 0.9672804065048695, "learning_rate": 9.999955245475716e-06, "loss": -0.0487, "step": 658, "step_time": 6.143416751001496 }, { "clip_ratio/high_max": 0.02434239676222205, "clip_ratio/high_mean": 0.012171198381111026, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018421198474243283, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 533.25, "completions/mean_terminated_length": 533.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1227431818842888, "epoch": 0.00659, "frac_reward_zero_std": 0.25, "grad_norm": 0.41725948452949524, "kl": 0.8480838816612959, "learning_rate": 9.9999551014551e-06, "loss": -0.0189, "num_tokens": 15146090.0, "reward": 3.095029830932617, "reward_std": 1.6683357954025269, "rewards/rollout_reward_func/mean": 3.095029830932617, "rewards/rollout_reward_func/std": 2.6952426433563232, "sampling/importance_sampling_ratio/max": 0.9724170565605164, "sampling/importance_sampling_ratio/mean": 0.6109325885772705, "sampling/importance_sampling_ratio/min": 5.812678864458576e-05, "sampling/sampling_logp_difference/max": 2.572464942932129, "sampling/sampling_logp_difference/mean": 0.15728434920310974, "step": 659, "step_time": 11.105535618999056 }, { "clip_ratio/high_max": 0.008620689623057842, "clip_ratio/high_mean": 0.004310344811528921, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010560344904661179, "entropy": 1.1317437440156937, "epoch": 0.0066, "grad_norm": 0.1714155077934265, "kl": 0.8564250022172928, "learning_rate": 9.999954957203131e-06, "loss": -0.0192, "step": 660, "step_time": 6.463394429998516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018939394503831863, "completions/clipped_ratio": 0.0, "completions/max_length": 765.0, "completions/max_terminated_length": 765.0, "completions/mean_length": 80.34375, "completions/mean_terminated_length": 80.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0533891078084707, "epoch": 0.00661, "frac_reward_zero_std": 0.25, "grad_norm": 0.5256734490394592, "kl": 0.9258831571787596, "learning_rate": 9.999954812719802e-06, "loss": -0.0203, "num_tokens": 15182429.0, "reward": 2.937382698059082, "reward_std": 2.189143657684326, "rewards/rollout_reward_func/mean": 2.937382698059082, "rewards/rollout_reward_func/std": 2.6105217933654785, "sampling/importance_sampling_ratio/max": 1.0543407201766968, "sampling/importance_sampling_ratio/mean": 0.7292314171791077, "sampling/importance_sampling_ratio/min": 6.361938735373942e-17, "sampling/sampling_logp_difference/max": 16.23630142211914, "sampling/sampling_logp_difference/mean": 0.4716365933418274, "step": 661, "step_time": 9.05494765200092 }, { "clip_ratio/high_max": 0.0416666679084301, "clip_ratio/high_mean": 0.02083333395421505, "clip_ratio/low_mean": 0.03977272845804691, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06060606241226196, "entropy": 0.9782682936638594, "epoch": 0.00662, "grad_norm": 0.17347757518291473, "kl": 0.4815612966194749, "learning_rate": 9.999954668005119e-06, "loss": -0.0226, "step": 662, "step_time": 5.157065107998278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1185.0, "completions/max_terminated_length": 1185.0, "completions/mean_length": 343.75, "completions/mean_terminated_length": 343.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.24800606071949, "epoch": 0.00663, "frac_reward_zero_std": 0.0, "grad_norm": 0.7216439247131348, "kl": 1.767958179116249, "learning_rate": 9.999954523059077e-06, "loss": -0.0183, "num_tokens": 15231385.0, "reward": 1.5581519603729248, "reward_std": 2.28558349609375, "rewards/rollout_reward_func/mean": 1.5581519603729248, "rewards/rollout_reward_func/std": 3.42142915725708, "sampling/importance_sampling_ratio/max": 1.1143218278884888, "sampling/importance_sampling_ratio/mean": 0.6937257051467896, "sampling/importance_sampling_ratio/min": 0.002521720714867115, "sampling/sampling_logp_difference/max": 2.5521674156188965, "sampling/sampling_logp_difference/mean": 0.2008143961429596, "step": 663, "step_time": 10.692989361999935 }, { "clip_ratio/high_max": 0.03869047714397311, "clip_ratio/high_mean": 0.019345238571986556, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0325396831613034, "entropy": 1.2528190463781357, "epoch": 0.00664, "grad_norm": 0.27700576186180115, "kl": 1.6158296689391136, "learning_rate": 9.999954377881679e-06, "loss": -0.0193, "step": 664, "step_time": 6.412410696002553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 227.3125, "completions/mean_terminated_length": 227.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7660383470356464, "epoch": 0.00665, "frac_reward_zero_std": 0.5, "grad_norm": 0.13410823047161102, "kl": 0.9889683928340673, "learning_rate": 9.999954232472924e-06, "loss": -0.0033, "num_tokens": 15273007.0, "reward": 3.7065539360046387, "reward_std": 0.9398497343063354, "rewards/rollout_reward_func/mean": 3.7065539360046387, "rewards/rollout_reward_func/std": 1.9890903234481812, "sampling/importance_sampling_ratio/max": 0.9728983640670776, "sampling/importance_sampling_ratio/mean": 0.7914444208145142, "sampling/importance_sampling_ratio/min": 0.013864405453205109, "sampling/sampling_logp_difference/max": 2.0393128395080566, "sampling/sampling_logp_difference/mean": 0.09857117384672165, "step": 665, "step_time": 10.363502661002713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7707051485776901, "epoch": 0.00666, "grad_norm": 0.15677975118160248, "kl": 0.9901822376996279, "learning_rate": 9.999954086832815e-06, "loss": -0.0034, "step": 666, "step_time": 6.315572243000133 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01215277798473835, "completions/clipped_ratio": 0.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 250.21875, "completions/mean_terminated_length": 250.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7426019757986069, "epoch": 0.00667, "frac_reward_zero_std": 0.25, "grad_norm": 0.2548815608024597, "kl": 0.2326744943857193, "learning_rate": 9.999953940961346e-06, "loss": -0.0312, "num_tokens": 15314500.0, "reward": 2.692079544067383, "reward_std": 2.4513769149780273, "rewards/rollout_reward_func/mean": 2.692079544067383, "rewards/rollout_reward_func/std": 2.867853879928589, "sampling/importance_sampling_ratio/max": 0.9741671085357666, "sampling/importance_sampling_ratio/mean": 0.7602120637893677, "sampling/importance_sampling_ratio/min": 0.12657208740711212, "sampling/sampling_logp_difference/max": 1.9987499713897705, "sampling/sampling_logp_difference/mean": 0.09521111845970154, "step": 667, "step_time": 10.344749029003651 }, { "clip_ratio/high_max": 0.032638889737427235, "clip_ratio/high_mean": 0.016319444868713617, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.030208333395421505, "entropy": 0.741941636428237, "epoch": 0.00668, "grad_norm": 0.1420123279094696, "kl": 0.23092889226973057, "learning_rate": 9.999953794858524e-06, "loss": -0.0317, "step": 668, "step_time": 5.730196712000179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 462.90625, "completions/mean_terminated_length": 463.2903137207031, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4906889647245407, "epoch": 0.00669, "frac_reward_zero_std": 0.0, "grad_norm": 0.33283931016921997, "kl": 1.2824976220726967, "learning_rate": 9.999953648524343e-06, "loss": -0.0613, "num_tokens": 15367592.0, "reward": 1.5529732704162598, "reward_std": 2.705458879470825, "rewards/rollout_reward_func/mean": 1.5529732704162598, "rewards/rollout_reward_func/std": 3.391281843185425, "sampling/importance_sampling_ratio/max": 0.96673184633255, "sampling/importance_sampling_ratio/mean": 0.6024338603019714, "sampling/importance_sampling_ratio/min": 0.007230240851640701, "sampling/sampling_logp_difference/max": 2.7177155017852783, "sampling/sampling_logp_difference/mean": 0.22089457511901855, "step": 669, "step_time": 11.0836905030028 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.020833333721384406, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024739583721384406, "entropy": 1.5402548760175705, "epoch": 0.0067, "grad_norm": 0.3303788900375366, "kl": 1.277177445590496, "learning_rate": 9.999953501958806e-06, "loss": -0.0625, "step": 670, "step_time": 6.34255052700064 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.03125, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 414.5625, "completions/mean_terminated_length": 412.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8326989505439997, "epoch": 0.00671, "frac_reward_zero_std": 0.25, "grad_norm": 0.16125068068504333, "kl": 0.9622208699584007, "learning_rate": 9.999953355161913e-06, "loss": -0.0403, "num_tokens": 15418858.0, "reward": 3.8179726600646973, "reward_std": 1.930038571357727, "rewards/rollout_reward_func/mean": 3.8179726600646973, "rewards/rollout_reward_func/std": 2.159648895263672, "sampling/importance_sampling_ratio/max": 1.0152547359466553, "sampling/importance_sampling_ratio/mean": 0.7512543201446533, "sampling/importance_sampling_ratio/min": 0.014012412168085575, "sampling/sampling_logp_difference/max": 2.570488929748535, "sampling/sampling_logp_difference/mean": 0.12409421056509018, "step": 671, "step_time": 10.804057120003563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 0.8539750650525093, "epoch": 0.00672, "grad_norm": 0.19485945999622345, "kl": 0.9626263044774532, "learning_rate": 9.999953208133663e-06, "loss": -0.0396, "step": 672, "step_time": 5.833554139999251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1120.0, "completions/max_terminated_length": 1120.0, "completions/mean_length": 292.5625, "completions/mean_terminated_length": 292.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5980728026479483, "epoch": 0.00673, "frac_reward_zero_std": 0.5, "grad_norm": 0.21288256347179413, "kl": 0.5587497316300869, "learning_rate": 9.999953060874058e-06, "loss": -0.0229, "num_tokens": 15461454.0, "reward": 3.2239065170288086, "reward_std": 0.8521279096603394, "rewards/rollout_reward_func/mean": 3.2239065170288086, "rewards/rollout_reward_func/std": 2.412761926651001, "sampling/importance_sampling_ratio/max": 0.97415691614151, "sampling/importance_sampling_ratio/mean": 0.8285682797431946, "sampling/importance_sampling_ratio/min": 0.014087332412600517, "sampling/sampling_logp_difference/max": 2.1129705905914307, "sampling/sampling_logp_difference/mean": 0.07157458364963531, "step": 673, "step_time": 11.071045076998416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6116737239062786, "epoch": 0.00674, "grad_norm": 0.2101197987794876, "kl": 0.5753411203622818, "learning_rate": 9.999952913383096e-06, "loss": -0.0232, "step": 674, "step_time": 5.777149210000061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.03125, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 322.0, "completions/mean_terminated_length": 314.774169921875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.530444074422121, "epoch": 0.00675, "frac_reward_zero_std": 0.25, "grad_norm": 0.2160310596227646, "kl": 3.7108008712530136, "learning_rate": 9.999952765660777e-06, "loss": -0.0473, "num_tokens": 15508558.0, "reward": 4.013862609863281, "reward_std": 1.6688671112060547, "rewards/rollout_reward_func/mean": 4.013862609863281, "rewards/rollout_reward_func/std": 1.8579072952270508, "sampling/importance_sampling_ratio/max": 1.1361182928085327, "sampling/importance_sampling_ratio/mean": 0.8541666269302368, "sampling/importance_sampling_ratio/min": 0.013382199220359325, "sampling/sampling_logp_difference/max": 2.109346628189087, "sampling/sampling_logp_difference/mean": 0.07287147641181946, "step": 675, "step_time": 10.449606047002817 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "entropy": 0.5381306149065495, "epoch": 0.00676, "grad_norm": 0.43795156478881836, "kl": 2.6422742381691933, "learning_rate": 9.999952617707101e-06, "loss": -0.0468, "step": 676, "step_time": 5.272687710003083 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 358.84375, "completions/mean_terminated_length": 358.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2984204813838005, "epoch": 0.00677, "frac_reward_zero_std": 0.25, "grad_norm": 0.8739849925041199, "kl": 0.8083859141916037, "learning_rate": 9.99995246952207e-06, "loss": -0.043, "num_tokens": 15557016.0, "reward": 1.9413164854049683, "reward_std": 2.5160155296325684, "rewards/rollout_reward_func/mean": 1.9413164854049683, "rewards/rollout_reward_func/std": 3.1011528968811035, "sampling/importance_sampling_ratio/max": 0.978008508682251, "sampling/importance_sampling_ratio/mean": 0.66401207447052, "sampling/importance_sampling_ratio/min": 3.5235755979634575e-10, "sampling/sampling_logp_difference/max": 8.836624145507812, "sampling/sampling_logp_difference/mean": 0.29176315665245056, "step": 677, "step_time": 11.577958216003026 }, { "clip_ratio/high_max": 0.03636363707482815, "clip_ratio/high_mean": 0.018181818537414074, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018181818537414074, "entropy": 1.2834895998239517, "epoch": 0.00678, "grad_norm": 0.2518872618675232, "kl": 0.5340706845745444, "learning_rate": 9.999952321105682e-06, "loss": -0.0456, "step": 678, "step_time": 6.018503567998778 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018308081198483706, "completions/clipped_ratio": 0.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 230.4375, "completions/mean_terminated_length": 230.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.378210872411728, "epoch": 0.00679, "frac_reward_zero_std": 0.25, "grad_norm": 0.3196030855178833, "kl": 0.8945608185604215, "learning_rate": 9.999952172457937e-06, "loss": -0.0103, "num_tokens": 15597793.0, "reward": 1.0948052406311035, "reward_std": 1.9075368642807007, "rewards/rollout_reward_func/mean": 1.0948052406311035, "rewards/rollout_reward_func/std": 3.2896203994750977, "sampling/importance_sampling_ratio/max": 0.9731488823890686, "sampling/importance_sampling_ratio/mean": 0.6863678693771362, "sampling/importance_sampling_ratio/min": 0.002166410442441702, "sampling/sampling_logp_difference/max": 3.2171220779418945, "sampling/sampling_logp_difference/mean": 0.23090805113315582, "step": 679, "step_time": 10.413507968998601 }, { "clip_ratio/high_max": 0.022727273404598236, "clip_ratio/high_mean": 0.011363636702299118, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01657197019085288, "entropy": 1.3493057861924171, "epoch": 0.0068, "grad_norm": 0.3109433948993683, "kl": 0.8918113689869642, "learning_rate": 9.999952023578835e-06, "loss": -0.0113, "step": 680, "step_time": 5.375003786997695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1028.0, "completions/max_terminated_length": 1028.0, "completions/mean_length": 492.28125, "completions/mean_terminated_length": 492.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6567766964435577, "epoch": 0.00681, "frac_reward_zero_std": 0.0, "grad_norm": 0.3711012899875641, "kl": 1.2122439499944448, "learning_rate": 9.999951874468378e-06, "loss": -0.0163, "num_tokens": 15652727.0, "reward": 0.8863288164138794, "reward_std": 3.226207733154297, "rewards/rollout_reward_func/mean": 0.8863288164138794, "rewards/rollout_reward_func/std": 3.5649049282073975, "sampling/importance_sampling_ratio/max": 1.0542861223220825, "sampling/importance_sampling_ratio/mean": 0.5232844352722168, "sampling/importance_sampling_ratio/min": 0.012695248238742352, "sampling/sampling_logp_difference/max": 3.0990047454833984, "sampling/sampling_logp_difference/mean": 0.23440498113632202, "step": 681, "step_time": 10.980051268001262 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.018750000279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026562500279396772, "entropy": 1.6738497987389565, "epoch": 0.00682, "grad_norm": 0.3522874116897583, "kl": 1.1976009719073772, "learning_rate": 9.999951725126565e-06, "loss": -0.0176, "step": 682, "step_time": 5.627948364997792 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1320.0, "completions/max_terminated_length": 1320.0, "completions/mean_length": 432.3125, "completions/mean_terminated_length": 432.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9489440843462944, "epoch": 0.00683, "frac_reward_zero_std": 0.25, "grad_norm": 0.1799781322479248, "kl": 1.0546392798423767, "learning_rate": 9.999951575553396e-06, "loss": -0.0581, "num_tokens": 15702424.0, "reward": 1.901170253753662, "reward_std": 2.279921531677246, "rewards/rollout_reward_func/mean": 1.901170253753662, "rewards/rollout_reward_func/std": 3.164052963256836, "sampling/importance_sampling_ratio/max": 1.2514595985412598, "sampling/importance_sampling_ratio/mean": 0.7431653738021851, "sampling/importance_sampling_ratio/min": 0.06787999719381332, "sampling/sampling_logp_difference/max": 1.8494679927825928, "sampling/sampling_logp_difference/mean": 0.11107705533504486, "step": 683, "step_time": 11.670202426999822 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.9578038454055786, "epoch": 0.00684, "grad_norm": 0.19654065370559692, "kl": 1.0439672283828259, "learning_rate": 9.99995142574887e-06, "loss": -0.0585, "step": 684, "step_time": 6.165464283996698 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 1110.0, "completions/max_terminated_length": 1110.0, "completions/mean_length": 553.875, "completions/mean_terminated_length": 553.875, "completions/min_length": 366.0, "completions/min_terminated_length": 366.0, "entropy": 1.3984437733888626, "epoch": 0.00685, "frac_reward_zero_std": 0.0, "grad_norm": 0.5469467639923096, "kl": 1.163693793118, "learning_rate": 9.999951275712985e-06, "loss": -0.031, "num_tokens": 15759243.0, "reward": 2.9250869750976562, "reward_std": 2.5136444568634033, "rewards/rollout_reward_func/mean": 2.9250869750976562, "rewards/rollout_reward_func/std": 3.159245252609253, "sampling/importance_sampling_ratio/max": 1.3637312650680542, "sampling/importance_sampling_ratio/mean": 0.5947450995445251, "sampling/importance_sampling_ratio/min": 0.01316916849464178, "sampling/sampling_logp_difference/max": 2.0515758991241455, "sampling/sampling_logp_difference/mean": 0.19007304310798645, "step": 685, "step_time": 10.521191903999352 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.018181818537414074, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03068181872367859, "entropy": 1.42578936368227, "epoch": 0.00686, "grad_norm": 0.34447818994522095, "kl": 1.1610586494207382, "learning_rate": 9.999951125445747e-06, "loss": -0.0341, "step": 686, "step_time": 6.351040375999219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.03125, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 432.59375, "completions/mean_terminated_length": 427.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.534609667956829, "epoch": 0.00687, "frac_reward_zero_std": 0.0, "grad_norm": 0.7446345686912537, "kl": 0.8782016299664974, "learning_rate": 9.999950974947152e-06, "loss": -0.0202, "num_tokens": 15810425.0, "reward": 1.706829309463501, "reward_std": 2.0286221504211426, "rewards/rollout_reward_func/mean": 1.706829309463501, "rewards/rollout_reward_func/std": 3.4324488639831543, "sampling/importance_sampling_ratio/max": 1.0604833364486694, "sampling/importance_sampling_ratio/mean": 0.551876962184906, "sampling/importance_sampling_ratio/min": 0.0, "sampling/sampling_logp_difference/max": 4.243278980255127, "sampling/sampling_logp_difference/mean": 0.30422544479370117, "step": 687, "step_time": 10.783617786999457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01105769257992506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01105769257992506, "entropy": 1.5080619230866432, "epoch": 0.00688, "grad_norm": 0.27799883484840393, "kl": 0.8930287901312113, "learning_rate": 9.999950824217199e-06, "loss": -0.0204, "step": 688, "step_time": 5.8935754940011975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 497.40625, "completions/mean_terminated_length": 497.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3278530091047287, "epoch": 0.00689, "frac_reward_zero_std": 0.0, "grad_norm": 0.2668003737926483, "kl": 1.0513174682855606, "learning_rate": 9.999950673255892e-06, "loss": -0.0337, "num_tokens": 15863422.0, "reward": 2.9587857723236084, "reward_std": 3.1212146282196045, "rewards/rollout_reward_func/mean": 2.9587857723236084, "rewards/rollout_reward_func/std": 3.1030220985412598, "sampling/importance_sampling_ratio/max": 0.9810816049575806, "sampling/importance_sampling_ratio/mean": 0.632847011089325, "sampling/importance_sampling_ratio/min": 0.014979135245084763, "sampling/sampling_logp_difference/max": 2.2442517280578613, "sampling/sampling_logp_difference/mean": 0.17137658596038818, "step": 689, "step_time": 10.887152675002653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3220851570367813, "epoch": 0.0069, "grad_norm": 0.22095012664794922, "kl": 1.0494037698954344, "learning_rate": 9.999950522063228e-06, "loss": -0.0341, "step": 690, "step_time": 6.552611707000324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 505.03125, "completions/mean_terminated_length": 505.03125, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.217762354761362, "epoch": 0.00691, "frac_reward_zero_std": 0.0, "grad_norm": 0.23744578659534454, "kl": 1.392749436199665, "learning_rate": 9.999950370639208e-06, "loss": -0.0396, "num_tokens": 15918771.0, "reward": 3.1029281616210938, "reward_std": 2.3469700813293457, "rewards/rollout_reward_func/mean": 3.1029281616210938, "rewards/rollout_reward_func/std": 2.8528847694396973, "sampling/importance_sampling_ratio/max": 0.9466618299484253, "sampling/importance_sampling_ratio/mean": 0.6792634725570679, "sampling/importance_sampling_ratio/min": 2.7950012028671138e-14, "sampling/sampling_logp_difference/max": 15.162330627441406, "sampling/sampling_logp_difference/mean": 0.3211866021156311, "step": 691, "step_time": 10.638540868005293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.028766026254743338, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028766026254743338, "entropy": 1.2226739786565304, "epoch": 0.00692, "grad_norm": 0.24300573766231537, "kl": 1.3917012810707092, "learning_rate": 9.99995021898383e-06, "loss": -0.0394, "step": 692, "step_time": 6.269375394000235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013194444589316845, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "completions/clipped_ratio": 0.0, "completions/max_length": 1200.0, "completions/max_terminated_length": 1200.0, "completions/mean_length": 347.53125, "completions/mean_terminated_length": 347.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.98963313549757, "epoch": 0.00693, "frac_reward_zero_std": 0.0, "grad_norm": 0.146032452583313, "kl": 0.8821181952953339, "learning_rate": 9.999950067097097e-06, "loss": -0.0377, "num_tokens": 15968034.0, "reward": 2.4868264198303223, "reward_std": 2.2671637535095215, "rewards/rollout_reward_func/mean": 2.4868264198303223, "rewards/rollout_reward_func/std": 3.095346689224243, "sampling/importance_sampling_ratio/max": 0.9600386023521423, "sampling/importance_sampling_ratio/mean": 0.7629973292350769, "sampling/importance_sampling_ratio/min": 5.101572838411208e-19, "sampling/sampling_logp_difference/max": 19.236339569091797, "sampling/sampling_logp_difference/mean": 0.3180384039878845, "step": 693, "step_time": 10.491703471996516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01753472234122455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01753472234122455, "entropy": 1.02033756300807, "epoch": 0.00694, "grad_norm": 0.14267510175704956, "kl": 0.8900450672954321, "learning_rate": 9.999949914979008e-06, "loss": -0.038, "step": 694, "step_time": 6.218028188002791 }, { "clip_ratio/high_max": 0.029513888992369175, "clip_ratio/high_mean": 0.014756944496184587, "clip_ratio/low_mean": 0.044270834885537624, "clip_ratio/low_min": 0.0052083334885537624, "clip_ratio/region_mean": 0.05902777938172221, "completions/clipped_ratio": 0.0, "completions/max_length": 593.0, "completions/max_terminated_length": 593.0, "completions/mean_length": 153.1875, "completions/mean_terminated_length": 153.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4196195341646671, "epoch": 0.00695, "frac_reward_zero_std": 0.0, "grad_norm": 0.5154340267181396, "kl": 3.2963942289352417, "learning_rate": 9.999949762629561e-06, "loss": -0.0152, "num_tokens": 16008910.0, "reward": -0.8506970405578613, "reward_std": 1.4665751457214355, "rewards/rollout_reward_func/mean": -0.8506970405578613, "rewards/rollout_reward_func/std": 3.259373903274536, "sampling/importance_sampling_ratio/max": 2.0835776329040527, "sampling/importance_sampling_ratio/mean": 0.7092008590698242, "sampling/importance_sampling_ratio/min": 6.491773666064755e-17, "sampling/sampling_logp_difference/max": 15.245697021484375, "sampling/sampling_logp_difference/mean": 0.4426763951778412, "step": 695, "step_time": 8.382450400999005 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.03125000046566129, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.038194444961845875, "entropy": 1.4353234842419624, "epoch": 0.00696, "grad_norm": 0.4791974425315857, "kl": 2.761436216533184, "learning_rate": 9.999949610048761e-06, "loss": -0.0169, "step": 696, "step_time": 5.2328747139963525 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.02013507392257452, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024599359836429358, "completions/clipped_ratio": 0.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 686.875, "completions/mean_terminated_length": 686.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.7994730472564697, "epoch": 0.00697, "frac_reward_zero_std": 0.0, "grad_norm": 0.3188488185405731, "kl": 1.5166853666305542, "learning_rate": 9.999949457236602e-06, "loss": -0.019, "num_tokens": 16069178.0, "reward": 0.5976769328117371, "reward_std": 3.3270511627197266, "rewards/rollout_reward_func/mean": 0.5976769328117371, "rewards/rollout_reward_func/std": 3.239279270172119, "sampling/importance_sampling_ratio/max": 0.9441615343093872, "sampling/importance_sampling_ratio/mean": 0.22183361649513245, "sampling/importance_sampling_ratio/min": 5.402185782649464e-19, "sampling/sampling_logp_difference/max": 14.709997177124023, "sampling/sampling_logp_difference/mean": 0.5812075138092041, "step": 697, "step_time": 11.625562820001505 }, { "clip_ratio/high_max": 0.017427884973585606, "clip_ratio/high_mean": 0.008713942486792803, "clip_ratio/low_mean": 0.024942766409367323, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.033656708896160126, "entropy": 2.8165736347436905, "epoch": 0.00698, "grad_norm": 0.17855101823806763, "kl": 1.5243366211652756, "learning_rate": 9.999949304193089e-06, "loss": -0.0193, "step": 698, "step_time": 6.054178389998924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 232.125, "completions/mean_terminated_length": 232.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7215318772941828, "epoch": 0.00699, "frac_reward_zero_std": 0.5, "grad_norm": 0.1280720978975296, "kl": 0.40863119810819626, "learning_rate": 9.999949150918218e-06, "loss": -0.001, "num_tokens": 16109107.0, "reward": 3.890043258666992, "reward_std": 1.2653625011444092, "rewards/rollout_reward_func/mean": 3.890043258666992, "rewards/rollout_reward_func/std": 1.7710245847702026, "sampling/importance_sampling_ratio/max": 0.9749287962913513, "sampling/importance_sampling_ratio/mean": 0.781785249710083, "sampling/importance_sampling_ratio/min": 0.04443052038550377, "sampling/sampling_logp_difference/max": 2.1704089641571045, "sampling/sampling_logp_difference/mean": 0.10603167116641998, "step": 699, "step_time": 10.865474394000557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7324017882347107, "epoch": 0.007, "grad_norm": 0.15147289633750916, "kl": 0.4118319069966674, "learning_rate": 9.99994899741199e-06, "loss": -0.0014, "step": 700, "step_time": 6.251574097004777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "completions/clipped_ratio": 0.0625, "completions/max_length": 600.0, "completions/max_terminated_length": 600.0, "completions/mean_length": 204.4375, "completions/mean_terminated_length": 197.83334350585938, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7615912910550833, "epoch": 0.00701, "frac_reward_zero_std": 0.25, "grad_norm": 0.2675579786300659, "kl": 0.8386121718212962, "learning_rate": 9.99994884367441e-06, "loss": -0.0251, "num_tokens": 16150666.0, "reward": 3.3568310737609863, "reward_std": 1.8282184600830078, "rewards/rollout_reward_func/mean": 3.3568310737609863, "rewards/rollout_reward_func/std": 2.1402878761291504, "sampling/importance_sampling_ratio/max": 0.9730971455574036, "sampling/importance_sampling_ratio/mean": 0.8253008127212524, "sampling/importance_sampling_ratio/min": 5.96077995851374e-07, "sampling/sampling_logp_difference/max": 3.22568941116333, "sampling/sampling_logp_difference/mean": 0.15735632181167603, "step": 701, "step_time": 8.475710431999687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00657894741743803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "entropy": 0.7731505259871483, "epoch": 0.00702, "grad_norm": 0.25490912795066833, "kl": 0.7741247052326798, "learning_rate": 9.99994868970547e-06, "loss": -0.0258, "step": 702, "step_time": 4.636381572001483 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007211538730189204, "completions/clipped_ratio": 0.0, "completions/max_length": 1054.0, "completions/max_terminated_length": 1054.0, "completions/mean_length": 409.84375, "completions/mean_terminated_length": 409.84375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1348957121372223, "epoch": 0.00703, "frac_reward_zero_std": 0.25, "grad_norm": 0.1709185242652893, "kl": 1.0298800207674503, "learning_rate": 9.999948535505173e-06, "loss": -0.0248, "num_tokens": 16200456.0, "reward": 2.3389294147491455, "reward_std": 2.087308883666992, "rewards/rollout_reward_func/mean": 2.3389294147491455, "rewards/rollout_reward_func/std": 3.1245625019073486, "sampling/importance_sampling_ratio/max": 0.9694181680679321, "sampling/importance_sampling_ratio/mean": 0.6739037036895752, "sampling/importance_sampling_ratio/min": 2.5388701033079997e-05, "sampling/sampling_logp_difference/max": 2.515413522720337, "sampling/sampling_logp_difference/mean": 0.1934259980916977, "step": 703, "step_time": 11.03422798900283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1239128932356834, "epoch": 0.00704, "grad_norm": 0.17059028148651123, "kl": 1.0270867627114058, "learning_rate": 9.999948381073523e-06, "loss": -0.0252, "step": 704, "step_time": 6.214435978003166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.03125, "completions/max_length": 1159.0, "completions/max_terminated_length": 1159.0, "completions/mean_length": 257.3125, "completions/mean_terminated_length": 254.9354705810547, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5329257920384407, "epoch": 0.00705, "frac_reward_zero_std": 0.25, "grad_norm": 0.2081996649503708, "kl": 1.0305523881688714, "learning_rate": 9.999948226410515e-06, "loss": -0.0334, "num_tokens": 16244038.0, "reward": 1.0186103582382202, "reward_std": 1.927490472793579, "rewards/rollout_reward_func/mean": 1.0186103582382202, "rewards/rollout_reward_func/std": 3.6701362133026123, "sampling/importance_sampling_ratio/max": 0.9746058583259583, "sampling/importance_sampling_ratio/mean": 0.6426172852516174, "sampling/importance_sampling_ratio/min": 7.931886213379674e-14, "sampling/sampling_logp_difference/max": 12.138961791992188, "sampling/sampling_logp_difference/mean": 0.41534459590911865, "step": 705, "step_time": 10.83377302000008 }, { "clip_ratio/high_max": 0.005434782709926367, "clip_ratio/high_mean": 0.0027173913549631834, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005321558099240065, "entropy": 1.53877342492342, "epoch": 0.00706, "grad_norm": 0.14426468312740326, "kl": 0.9951328439638019, "learning_rate": 9.99994807151615e-06, "loss": -0.0336, "step": 706, "step_time": 5.846542577999571 }, { "clip_ratio/high_max": 0.02003205195069313, "clip_ratio/high_mean": 0.010016025975346565, "clip_ratio/low_mean": 0.004629629664123058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014645655173808336, "completions/clipped_ratio": 0.0625, "completions/max_length": 1218.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 521.4375, "completions/mean_terminated_length": 515.6333618164062, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2834216952323914, "epoch": 0.00707, "frac_reward_zero_std": 0.0, "grad_norm": 0.2228078842163086, "kl": 1.2693811506032944, "learning_rate": 9.999947916390432e-06, "loss": -0.0424, "num_tokens": 16298887.0, "reward": 0.3812609016895294, "reward_std": 3.042724609375, "rewards/rollout_reward_func/mean": 0.3812609016895294, "rewards/rollout_reward_func/std": 3.119166612625122, "sampling/importance_sampling_ratio/max": 0.936414897441864, "sampling/importance_sampling_ratio/mean": 0.34706586599349976, "sampling/importance_sampling_ratio/min": 2.1885723688517232e-14, "sampling/sampling_logp_difference/max": 14.284911155700684, "sampling/sampling_logp_difference/mean": 0.4340340495109558, "step": 707, "step_time": 11.858604929999274 }, { "clip_ratio/high_max": 0.029295182321220636, "clip_ratio/high_mean": 0.014647591160610318, "clip_ratio/low_mean": 0.004629629664123058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019277220824733377, "entropy": 2.285235345363617, "epoch": 0.00708, "grad_norm": 0.2638099193572998, "kl": 1.242487907409668, "learning_rate": 9.999947761033356e-06, "loss": -0.0429, "step": 708, "step_time": 6.6074701309989905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1258.0, "completions/max_terminated_length": 1258.0, "completions/mean_length": 276.375, "completions/mean_terminated_length": 276.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5814052876085043, "epoch": 0.00709, "frac_reward_zero_std": 0.5, "grad_norm": 0.07178284972906113, "kl": 0.6268240576609969, "learning_rate": 9.999947605444923e-06, "loss": -0.0167, "num_tokens": 16342469.0, "reward": 3.6142587661743164, "reward_std": 1.3826792240142822, "rewards/rollout_reward_func/mean": 3.6142587661743164, "rewards/rollout_reward_func/std": 1.9997174739837646, "sampling/importance_sampling_ratio/max": 0.9792645573616028, "sampling/importance_sampling_ratio/mean": 0.8434571027755737, "sampling/importance_sampling_ratio/min": 0.00945564080029726, "sampling/sampling_logp_difference/max": 1.866027593612671, "sampling/sampling_logp_difference/mean": 0.06978556513786316, "step": 709, "step_time": 10.944874740998785 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 0.58700349368155, "epoch": 0.0071, "grad_norm": 0.06273429840803146, "kl": 0.624102552421391, "learning_rate": 9.999947449625135e-06, "loss": -0.0167, "step": 710, "step_time": 6.0947552930010715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00975678744725883, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00975678744725883, "completions/clipped_ratio": 0.03125, "completions/max_length": 1388.0, "completions/max_terminated_length": 1388.0, "completions/mean_length": 403.59375, "completions/mean_terminated_length": 395.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.288341335952282, "epoch": 0.00711, "frac_reward_zero_std": 0.0, "grad_norm": 0.2302219569683075, "kl": 0.6280024256557226, "learning_rate": 9.99994729357399e-06, "loss": -0.0437, "num_tokens": 16392542.0, "reward": 2.768521785736084, "reward_std": 2.509544849395752, "rewards/rollout_reward_func/mean": 2.768521785736084, "rewards/rollout_reward_func/std": 2.712817907333374, "sampling/importance_sampling_ratio/max": 0.9670240879058838, "sampling/importance_sampling_ratio/mean": 0.6343540549278259, "sampling/importance_sampling_ratio/min": 3.7706123494434787e-20, "sampling/sampling_logp_difference/max": 17.559988021850586, "sampling/sampling_logp_difference/mean": 0.29669609665870667, "step": 711, "step_time": 12.478313504001562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3189564496278763, "epoch": 0.00712, "grad_norm": 0.2539701759815216, "kl": 0.6386365722864866, "learning_rate": 9.99994713729149e-06, "loss": -0.0447, "step": 712, "step_time": 6.249661653002477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 292.90625, "completions/mean_terminated_length": 292.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.250430766493082, "epoch": 0.00713, "frac_reward_zero_std": 0.5, "grad_norm": 0.17611828446388245, "kl": 0.40910086035728455, "learning_rate": 9.999946980777633e-06, "loss": -0.0137, "num_tokens": 16434642.0, "reward": 2.1516895294189453, "reward_std": 0.8568745851516724, "rewards/rollout_reward_func/mean": 2.1516895294189453, "rewards/rollout_reward_func/std": 3.0629043579101562, "sampling/importance_sampling_ratio/max": 0.9836852550506592, "sampling/importance_sampling_ratio/mean": 0.7091336250305176, "sampling/importance_sampling_ratio/min": 0.0008134545059874654, "sampling/sampling_logp_difference/max": 2.817943572998047, "sampling/sampling_logp_difference/mean": 0.18526822328567505, "step": 713, "step_time": 11.503574036001737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2705977708101273, "epoch": 0.00714, "grad_norm": 0.18855023384094238, "kl": 0.4138609548099339, "learning_rate": 9.999946824032421e-06, "loss": -0.0139, "step": 714, "step_time": 6.061986191998585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1155.0, "completions/max_terminated_length": 1155.0, "completions/mean_length": 425.75, "completions/mean_terminated_length": 425.75, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5083342641592026, "epoch": 0.00715, "frac_reward_zero_std": 0.0, "grad_norm": 0.23276129364967346, "kl": 1.313523668795824, "learning_rate": 9.999946667055852e-06, "loss": -0.0275, "num_tokens": 16485783.0, "reward": 2.058114528656006, "reward_std": 3.114407777786255, "rewards/rollout_reward_func/mean": 2.058114528656006, "rewards/rollout_reward_func/std": 3.125231981277466, "sampling/importance_sampling_ratio/max": 0.9676843881607056, "sampling/importance_sampling_ratio/mean": 0.5799787044525146, "sampling/importance_sampling_ratio/min": 0.00018542584439273924, "sampling/sampling_logp_difference/max": 2.2613987922668457, "sampling/sampling_logp_difference/mean": 0.22456389665603638, "step": 715, "step_time": 10.856274112002211 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.5315163284540176, "epoch": 0.00716, "grad_norm": 0.14867837727069855, "kl": 1.3264585547149181, "learning_rate": 9.999946509847928e-06, "loss": -0.028, "step": 716, "step_time": 6.48393881600532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1336.0, "completions/max_terminated_length": 1336.0, "completions/mean_length": 652.625, "completions/mean_terminated_length": 652.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.010711260139942, "epoch": 0.00717, "frac_reward_zero_std": 0.0, "grad_norm": 0.14136812090873718, "kl": 1.7092978656291962, "learning_rate": 9.999946352408647e-06, "loss": -0.0548, "num_tokens": 16545524.0, "reward": 1.4913735389709473, "reward_std": 2.8382680416107178, "rewards/rollout_reward_func/mean": 1.4913735389709473, "rewards/rollout_reward_func/std": 3.1095130443573, "sampling/importance_sampling_ratio/max": 0.9587847590446472, "sampling/importance_sampling_ratio/mean": 0.42226600646972656, "sampling/importance_sampling_ratio/min": 0.0004052211588714272, "sampling/sampling_logp_difference/max": 2.705294370651245, "sampling/sampling_logp_difference/mean": 0.3036331832408905, "step": 717, "step_time": 12.422879270001431 }, { "clip_ratio/high_max": 0.017361111473292112, "clip_ratio/high_mean": 0.008680555736646056, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014930555829778314, "entropy": 2.0201208293437958, "epoch": 0.00718, "grad_norm": 0.09366615861654282, "kl": 1.6997064054012299, "learning_rate": 9.99994619473801e-06, "loss": -0.0549, "step": 718, "step_time": 6.332277901001362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 715.0, "completions/max_terminated_length": 715.0, "completions/mean_length": 219.125, "completions/mean_terminated_length": 219.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.866217240691185, "epoch": 0.00719, "frac_reward_zero_std": 0.25, "grad_norm": 0.32234326004981995, "kl": 1.0717353858053684, "learning_rate": 9.999946036836018e-06, "loss": -0.0345, "num_tokens": 16587187.0, "reward": 1.3634614944458008, "reward_std": 1.834265947341919, "rewards/rollout_reward_func/mean": 1.3634614944458008, "rewards/rollout_reward_func/std": 3.6145057678222656, "sampling/importance_sampling_ratio/max": 0.9688012599945068, "sampling/importance_sampling_ratio/mean": 0.5288845300674438, "sampling/importance_sampling_ratio/min": 5.936646630289033e-05, "sampling/sampling_logp_difference/max": 2.266493797302246, "sampling/sampling_logp_difference/mean": 0.2731966972351074, "step": 719, "step_time": 9.045671411000512 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.8418640941381454, "epoch": 0.0072, "grad_norm": 0.2916579246520996, "kl": 1.0404832754284143, "learning_rate": 9.999945878702668e-06, "loss": -0.0356, "step": 720, "step_time": 4.953895949998696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1138.0, "completions/max_terminated_length": 1138.0, "completions/mean_length": 195.9375, "completions/mean_terminated_length": 195.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.4525218531489372, "epoch": 0.00721, "frac_reward_zero_std": 0.5, "grad_norm": 0.050764817744493484, "kl": 0.5766761023551226, "learning_rate": 9.999945720337964e-06, "loss": -0.0196, "num_tokens": 16627844.0, "reward": 3.9200642108917236, "reward_std": 0.8740255832672119, "rewards/rollout_reward_func/mean": 3.9200642108917236, "rewards/rollout_reward_func/std": 1.7278355360031128, "sampling/importance_sampling_ratio/max": 0.9677295684814453, "sampling/importance_sampling_ratio/mean": 0.8989293575286865, "sampling/importance_sampling_ratio/min": 0.014901255257427692, "sampling/sampling_logp_difference/max": 2.132530689239502, "sampling/sampling_logp_difference/mean": 0.04537564516067505, "step": 721, "step_time": 11.008691977000126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.446088045835495, "epoch": 0.00722, "grad_norm": 0.04916887730360031, "kl": 0.5744802076369524, "learning_rate": 9.999945561741904e-06, "loss": -0.0196, "step": 722, "step_time": 5.578622704997542 }, { "clip_ratio/high_max": 0.01854395680129528, "clip_ratio/high_mean": 0.00927197840064764, "clip_ratio/low_mean": 0.004999999888241291, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014271978288888931, "completions/clipped_ratio": 0.0625, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 517.53125, "completions/mean_terminated_length": 516.9000244140625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3707438260316849, "epoch": 0.00723, "frac_reward_zero_std": 0.0, "grad_norm": 0.14564305543899536, "kl": 1.2166639156639576, "learning_rate": 9.999945402914486e-06, "loss": -0.0176, "num_tokens": 16683349.0, "reward": 2.409240961074829, "reward_std": 2.213261842727661, "rewards/rollout_reward_func/mean": 2.409240961074829, "rewards/rollout_reward_func/std": 2.9816102981567383, "sampling/importance_sampling_ratio/max": 0.9655287265777588, "sampling/importance_sampling_ratio/mean": 0.6019376516342163, "sampling/importance_sampling_ratio/min": 9.452952993389944e-21, "sampling/sampling_logp_difference/max": 16.308218002319336, "sampling/sampling_logp_difference/mean": 0.36200931668281555, "step": 723, "step_time": 10.820145634002984 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.00896739144809544, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013775083934888244, "entropy": 1.3556398302316666, "epoch": 0.00724, "grad_norm": 0.13818436861038208, "kl": 1.2206709552556276, "learning_rate": 9.999945243855714e-06, "loss": -0.0177, "step": 724, "step_time": 6.280334747001689 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.03125, "completions/max_length": 1125.0, "completions/max_terminated_length": 1125.0, "completions/mean_length": 346.875, "completions/mean_terminated_length": 357.5483703613281, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3842901214957237, "epoch": 0.00725, "frac_reward_zero_std": 0.25, "grad_norm": 0.10221826285123825, "kl": 0.9481401741504669, "learning_rate": 9.999945084565586e-06, "loss": 0.0138, "num_tokens": 16729007.0, "reward": 0.01418536901473999, "reward_std": 1.3344837427139282, "rewards/rollout_reward_func/mean": 0.01418536901473999, "rewards/rollout_reward_func/std": 2.87434983253479, "sampling/importance_sampling_ratio/max": 0.9659819602966309, "sampling/importance_sampling_ratio/mean": 0.6116534471511841, "sampling/importance_sampling_ratio/min": 0.0025356446858495474, "sampling/sampling_logp_difference/max": 3.077350616455078, "sampling/sampling_logp_difference/mean": 0.2038865089416504, "step": 725, "step_time": 11.181908701002612 }, { "clip_ratio/high_max": 0.018452381249517202, "clip_ratio/high_mean": 0.009226190624758601, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009226190624758601, "entropy": 1.393980573862791, "epoch": 0.00726, "grad_norm": 0.10009457916021347, "kl": 0.9487685337662697, "learning_rate": 9.9999449250441e-06, "loss": 0.0137, "step": 726, "step_time": 5.77676406199862 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 394.125, "completions/mean_terminated_length": 394.125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2320734038949013, "epoch": 0.00727, "frac_reward_zero_std": 0.25, "grad_norm": 0.20286297798156738, "kl": 0.6261159852147102, "learning_rate": 9.99994476529126e-06, "loss": -0.0413, "num_tokens": 16777612.0, "reward": 2.629361152648926, "reward_std": 2.5051417350769043, "rewards/rollout_reward_func/mean": 2.629361152648926, "rewards/rollout_reward_func/std": 3.07417368888855, "sampling/importance_sampling_ratio/max": 1.1448307037353516, "sampling/importance_sampling_ratio/mean": 0.6966780424118042, "sampling/importance_sampling_ratio/min": 0.001949093770235777, "sampling/sampling_logp_difference/max": 2.25054669380188, "sampling/sampling_logp_difference/mean": 0.17139741778373718, "step": 727, "step_time": 10.578929496001365 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 1.2192199006676674, "epoch": 0.00728, "grad_norm": 0.19012540578842163, "kl": 0.6206948347389698, "learning_rate": 9.999944605307064e-06, "loss": -0.0414, "step": 728, "step_time": 5.714140042999134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 342.1875, "completions/mean_terminated_length": 342.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1720299385488033, "epoch": 0.00729, "frac_reward_zero_std": 0.25, "grad_norm": 0.4893914759159088, "kl": 0.6881262706592679, "learning_rate": 9.999944445091512e-06, "loss": -0.043, "num_tokens": 16824496.0, "reward": 2.4250707626342773, "reward_std": 2.4388794898986816, "rewards/rollout_reward_func/mean": 2.4250707626342773, "rewards/rollout_reward_func/std": 2.97749924659729, "sampling/importance_sampling_ratio/max": 1.0407801866531372, "sampling/importance_sampling_ratio/mean": 0.6970431208610535, "sampling/importance_sampling_ratio/min": 0.033134765923023224, "sampling/sampling_logp_difference/max": 2.500720739364624, "sampling/sampling_logp_difference/mean": 0.14663532376289368, "step": 729, "step_time": 11.57646658700287 }, { "clip_ratio/high_max": 0.0193452388048172, "clip_ratio/high_mean": 0.0096726194024086, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015922619495540857, "entropy": 1.1858558766543865, "epoch": 0.0073, "grad_norm": 0.14938488602638245, "kl": 0.6814899332821369, "learning_rate": 9.999944284644603e-06, "loss": -0.0435, "step": 730, "step_time": 6.6498105470018345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1163.0, "completions/max_terminated_length": 1163.0, "completions/mean_length": 182.3125, "completions/mean_terminated_length": 182.3125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5390208959579468, "epoch": 0.00731, "frac_reward_zero_std": 0.5, "grad_norm": 0.23279452323913574, "kl": 0.49125757440924644, "learning_rate": 9.999944123966339e-06, "loss": -0.0249, "num_tokens": 16864843.0, "reward": -0.38415104150772095, "reward_std": 0.9495460987091064, "rewards/rollout_reward_func/mean": -0.38415104150772095, "rewards/rollout_reward_func/std": 3.073472023010254, "sampling/importance_sampling_ratio/max": 0.9566323161125183, "sampling/importance_sampling_ratio/mean": 0.6018893718719482, "sampling/importance_sampling_ratio/min": 0.0072864932008087635, "sampling/sampling_logp_difference/max": 2.095309257507324, "sampling/sampling_logp_difference/mean": 0.19587446749210358, "step": 731, "step_time": 10.454077489001065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.016964286100119352, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016964286100119352, "entropy": 1.5039719194173813, "epoch": 0.00732, "grad_norm": 0.2817038595676422, "kl": 0.48324679397046566, "learning_rate": 9.999943963056718e-06, "loss": -0.0255, "step": 732, "step_time": 5.805401275003533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1307.0, "completions/max_terminated_length": 1307.0, "completions/mean_length": 164.78125, "completions/mean_terminated_length": 164.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1267355307936668, "epoch": 0.00733, "frac_reward_zero_std": 0.25, "grad_norm": 0.257930725812912, "kl": 0.7922486811876297, "learning_rate": 9.999943801915744e-06, "loss": -0.0332, "num_tokens": 16904575.0, "reward": 1.0117430686950684, "reward_std": 0.9772793650627136, "rewards/rollout_reward_func/mean": 1.0117430686950684, "rewards/rollout_reward_func/std": 3.5388147830963135, "sampling/importance_sampling_ratio/max": 0.9670272469520569, "sampling/importance_sampling_ratio/mean": 0.7365247011184692, "sampling/importance_sampling_ratio/min": 0.02672133408486843, "sampling/sampling_logp_difference/max": 1.5663012266159058, "sampling/sampling_logp_difference/mean": 0.13203124701976776, "step": 733, "step_time": 10.991647715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.1084490567445755, "epoch": 0.00734, "grad_norm": 0.2554846405982971, "kl": 0.7941296501085162, "learning_rate": 9.999943640543411e-06, "loss": -0.034, "step": 734, "step_time": 6.6401699500002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 615.0, "completions/max_terminated_length": 615.0, "completions/mean_length": 126.1875, "completions/mean_terminated_length": 113.7741928100586, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8827915340662003, "epoch": 0.00735, "frac_reward_zero_std": 0.5, "grad_norm": 0.2160331904888153, "kl": 1.195009047165513, "learning_rate": 9.999943478939725e-06, "loss": 0.0028, "num_tokens": 16943425.0, "reward": 1.7093276977539062, "reward_std": 0.8124048709869385, "rewards/rollout_reward_func/mean": 1.7093276977539062, "rewards/rollout_reward_func/std": 2.981717348098755, "sampling/importance_sampling_ratio/max": 1.0360583066940308, "sampling/importance_sampling_ratio/mean": 0.8219861388206482, "sampling/importance_sampling_ratio/min": 4.959592068404397e-20, "sampling/sampling_logp_difference/max": 11.995474815368652, "sampling/sampling_logp_difference/mean": 0.3456697463989258, "step": 735, "step_time": 8.744869216996449 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 0.8714844845235348, "epoch": 0.00736, "grad_norm": 0.19736836850643158, "kl": 1.1879268698394299, "learning_rate": 9.99994331710468e-06, "loss": 0.0023, "step": 736, "step_time": 4.644672066004205 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "completions/clipped_ratio": 0.0, "completions/max_length": 1091.0, "completions/max_terminated_length": 1091.0, "completions/mean_length": 257.6875, "completions/mean_terminated_length": 257.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1993912160396576, "epoch": 0.00737, "frac_reward_zero_std": 0.25, "grad_norm": 0.19089308381080627, "kl": 0.721256542019546, "learning_rate": 9.99994315503828e-06, "loss": -0.0427, "num_tokens": 16987298.0, "reward": 2.6339199542999268, "reward_std": 2.2663393020629883, "rewards/rollout_reward_func/mean": 2.6339199542999268, "rewards/rollout_reward_func/std": 2.6304523944854736, "sampling/importance_sampling_ratio/max": 0.9731757640838623, "sampling/importance_sampling_ratio/mean": 0.6953997015953064, "sampling/importance_sampling_ratio/min": 2.5947189638152728e-18, "sampling/sampling_logp_difference/max": 20.009531021118164, "sampling/sampling_logp_difference/mean": 0.34981584548950195, "step": 737, "step_time": 10.797296971999458 }, { "clip_ratio/high_max": 0.02500000037252903, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 1.2079372629523277, "epoch": 0.00738, "grad_norm": 0.17181909084320068, "kl": 0.7181622213684022, "learning_rate": 9.999942992740524e-06, "loss": -0.0427, "step": 738, "step_time": 5.875095522003903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 339.03125, "completions/mean_terminated_length": 339.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0097907856106758, "epoch": 0.00739, "frac_reward_zero_std": 0.0, "grad_norm": 0.2953566908836365, "kl": 0.9018132574856281, "learning_rate": 9.999942830211414e-06, "loss": -0.0286, "num_tokens": 17034998.0, "reward": 0.9504473209381104, "reward_std": 1.7204838991165161, "rewards/rollout_reward_func/mean": 0.9504473209381104, "rewards/rollout_reward_func/std": 3.7884581089019775, "sampling/importance_sampling_ratio/max": 1.2767908573150635, "sampling/importance_sampling_ratio/mean": 0.7440256476402283, "sampling/importance_sampling_ratio/min": 0.005494681186974049, "sampling/sampling_logp_difference/max": 1.6309807300567627, "sampling/sampling_logp_difference/mean": 0.11661377549171448, "step": 739, "step_time": 10.278338381003778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.0139077678322792, "epoch": 0.0074, "grad_norm": 0.273610919713974, "kl": 0.9215409439057112, "learning_rate": 9.999942667450948e-06, "loss": -0.0293, "step": 740, "step_time": 5.492849941001623 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.03125, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 500.125, "completions/mean_terminated_length": 500.9031982421875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5775093883275986, "epoch": 0.00741, "frac_reward_zero_std": 0.25, "grad_norm": 0.3037446439266205, "kl": 1.1547160781919956, "learning_rate": 9.999942504459124e-06, "loss": 0.0017, "num_tokens": 17087590.0, "reward": 1.1941170692443848, "reward_std": 2.1688382625579834, "rewards/rollout_reward_func/mean": 1.1941170692443848, "rewards/rollout_reward_func/std": 2.9673521518707275, "sampling/importance_sampling_ratio/max": 0.9719746708869934, "sampling/importance_sampling_ratio/mean": 0.483052134513855, "sampling/importance_sampling_ratio/min": 0.009636974893510342, "sampling/sampling_logp_difference/max": 2.06992769241333, "sampling/sampling_logp_difference/mean": 0.1999482810497284, "step": 741, "step_time": 10.865904324002258 }, { "clip_ratio/high_max": 0.020979021675884724, "clip_ratio/high_mean": 0.010489510837942362, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015297203324735165, "entropy": 1.5819963663816452, "epoch": 0.00742, "grad_norm": 0.29288551211357117, "kl": 1.1595827676355839, "learning_rate": 9.999942341235946e-06, "loss": 0.0015, "step": 742, "step_time": 6.8411551470017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 472.03125, "completions/mean_terminated_length": 472.03125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1976790614426136, "epoch": 0.00743, "frac_reward_zero_std": 0.0, "grad_norm": 0.2130783051252365, "kl": 1.2141534984111786, "learning_rate": 9.999942177781411e-06, "loss": -0.0225, "num_tokens": 17141438.0, "reward": 1.6698075532913208, "reward_std": 2.885993719100952, "rewards/rollout_reward_func/mean": 1.6698075532913208, "rewards/rollout_reward_func/std": 3.519192934036255, "sampling/importance_sampling_ratio/max": 0.9642454981803894, "sampling/importance_sampling_ratio/mean": 0.6547057628631592, "sampling/importance_sampling_ratio/min": 0.03149237111210823, "sampling/sampling_logp_difference/max": 1.8005121946334839, "sampling/sampling_logp_difference/mean": 0.14075970649719238, "step": 743, "step_time": 10.33696545100429 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014756944496184587, "entropy": 1.1829180419445038, "epoch": 0.00744, "grad_norm": 0.20717108249664307, "kl": 1.2379378229379654, "learning_rate": 9.99994201409552e-06, "loss": -0.023, "step": 744, "step_time": 5.762270341998374 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.01193181797862053, "clip_ratio/low_min": 0.011363636702299118, "clip_ratio/region_mean": 0.016739510465413332, "completions/clipped_ratio": 0.0, "completions/max_length": 1324.0, "completions/max_terminated_length": 1324.0, "completions/mean_length": 319.5625, "completions/mean_terminated_length": 319.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.63786169141531, "epoch": 0.00745, "frac_reward_zero_std": 0.0, "grad_norm": 0.28672468662261963, "kl": 1.208209551870823, "learning_rate": 9.999941850178276e-06, "loss": -0.028, "num_tokens": 17188294.0, "reward": 1.527393102645874, "reward_std": 2.529413938522339, "rewards/rollout_reward_func/mean": 1.527393102645874, "rewards/rollout_reward_func/std": 3.5597145557403564, "sampling/importance_sampling_ratio/max": 0.961635172367096, "sampling/importance_sampling_ratio/mean": 0.5810189843177795, "sampling/importance_sampling_ratio/min": 0.008148299530148506, "sampling/sampling_logp_difference/max": 1.8167121410369873, "sampling/sampling_logp_difference/mean": 0.19034142792224884, "step": 745, "step_time": 11.181518258999859 }, { "clip_ratio/high_max": 0.040865384973585606, "clip_ratio/high_mean": 0.020432692486792803, "clip_ratio/low_mean": 0.031107954680919647, "clip_ratio/low_min": 0.012500000186264515, "clip_ratio/region_mean": 0.05154064716771245, "entropy": 1.5970302298665047, "epoch": 0.00746, "grad_norm": 0.2199828326702118, "kl": 1.202420050278306, "learning_rate": 9.999941686029675e-06, "loss": -0.0287, "step": 746, "step_time": 7.2687514679983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1097.0, "completions/max_terminated_length": 1097.0, "completions/mean_length": 256.0, "completions/mean_terminated_length": 256.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5553111750632524, "epoch": 0.00747, "frac_reward_zero_std": 0.25, "grad_norm": 0.2602752149105072, "kl": 0.7425967678427696, "learning_rate": 9.999941521649716e-06, "loss": -0.0007, "num_tokens": 17232906.0, "reward": 3.1231088638305664, "reward_std": 1.2337977886199951, "rewards/rollout_reward_func/mean": 3.1231088638305664, "rewards/rollout_reward_func/std": 2.7488043308258057, "sampling/importance_sampling_ratio/max": 1.0972802639007568, "sampling/importance_sampling_ratio/mean": 0.8716872930526733, "sampling/importance_sampling_ratio/min": 0.13410542905330658, "sampling/sampling_logp_difference/max": 1.8102099895477295, "sampling/sampling_logp_difference/mean": 0.06416697055101395, "step": 747, "step_time": 10.306842205000066 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014880952890962362, "entropy": 0.5533958058804274, "epoch": 0.00748, "grad_norm": 0.17512263357639313, "kl": 0.7384908003732562, "learning_rate": 9.999941357038403e-06, "loss": -0.0013, "step": 748, "step_time": 5.687685707001947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011363636702299118, "completions/clipped_ratio": 0.03125, "completions/max_length": 1188.0, "completions/max_terminated_length": 1188.0, "completions/mean_length": 439.96875, "completions/mean_terminated_length": 439.6128845214844, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4426124840974808, "epoch": 0.00749, "frac_reward_zero_std": 0.0, "grad_norm": 0.3044229745864868, "kl": 1.5025098584592342, "learning_rate": 9.999941192195736e-06, "loss": -0.0608, "num_tokens": 17285967.0, "reward": 1.2599141597747803, "reward_std": 3.4351470470428467, "rewards/rollout_reward_func/mean": 1.2599141597747803, "rewards/rollout_reward_func/std": 3.564197063446045, "sampling/importance_sampling_ratio/max": 0.9430684447288513, "sampling/importance_sampling_ratio/mean": 0.6277543306350708, "sampling/importance_sampling_ratio/min": 7.498358172597364e-05, "sampling/sampling_logp_difference/max": 2.067519187927246, "sampling/sampling_logp_difference/mean": 0.18985208868980408, "step": 749, "step_time": 11.196529008999278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4358116406947374, "epoch": 0.0075, "grad_norm": 0.27815917134284973, "kl": 1.4587557204067707, "learning_rate": 9.999941027121711e-06, "loss": -0.0614, "step": 750, "step_time": 6.407963655999993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 190.625, "completions/mean_terminated_length": 190.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4358988776803017, "epoch": 0.00751, "frac_reward_zero_std": 0.25, "grad_norm": 0.4224559962749481, "kl": 0.755068538710475, "learning_rate": 9.99994086181633e-06, "loss": -0.0521, "num_tokens": 17327629.0, "reward": 0.13756227493286133, "reward_std": 2.2488155364990234, "rewards/rollout_reward_func/mean": 0.13756227493286133, "rewards/rollout_reward_func/std": 3.2792019844055176, "sampling/importance_sampling_ratio/max": 1.0978341102600098, "sampling/importance_sampling_ratio/mean": 0.7130216360092163, "sampling/importance_sampling_ratio/min": 0.010635466314852238, "sampling/sampling_logp_difference/max": 2.000405788421631, "sampling/sampling_logp_difference/mean": 0.1949203610420227, "step": 751, "step_time": 9.98297038200144 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.4138308018445969, "epoch": 0.00752, "grad_norm": 0.1713394969701767, "kl": 0.7613006811589003, "learning_rate": 9.999940696279595e-06, "loss": -0.0539, "step": 752, "step_time": 5.42038458800198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1032.0, "completions/max_terminated_length": 1032.0, "completions/mean_length": 153.875, "completions/mean_terminated_length": 153.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6828135326504707, "epoch": 0.00753, "frac_reward_zero_std": 0.5, "grad_norm": 0.21687650680541992, "kl": 0.6982456650584936, "learning_rate": 9.999940530511504e-06, "loss": -0.0143, "num_tokens": 17364731.0, "reward": 3.528491973876953, "reward_std": 0.8823238611221313, "rewards/rollout_reward_func/mean": 3.528491973876953, "rewards/rollout_reward_func/std": 1.7654229402542114, "sampling/importance_sampling_ratio/max": 0.9730619192123413, "sampling/importance_sampling_ratio/mean": 0.8124685883522034, "sampling/importance_sampling_ratio/min": 0.10351046174764633, "sampling/sampling_logp_difference/max": 1.396101474761963, "sampling/sampling_logp_difference/mean": 0.0776946023106575, "step": 753, "step_time": 9.677782149996347 }, { "clip_ratio/high_max": 0.0234375, "clip_ratio/high_mean": 0.01171875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 0.6734142825007439, "epoch": 0.00754, "grad_norm": 0.10688108205795288, "kl": 0.7040037037804723, "learning_rate": 9.999940364512057e-06, "loss": -0.0149, "step": 754, "step_time": 6.0774636329988425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 550.0, "completions/max_terminated_length": 550.0, "completions/mean_length": 191.09375, "completions/mean_terminated_length": 191.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6735572256147861, "epoch": 0.00755, "frac_reward_zero_std": 0.5, "grad_norm": 0.02289707399904728, "kl": 0.6639370527118444, "learning_rate": 9.999940198281253e-06, "loss": -0.0209, "num_tokens": 17405689.0, "reward": 2.5171737670898438, "reward_std": 0.6229196786880493, "rewards/rollout_reward_func/mean": 2.5171737670898438, "rewards/rollout_reward_func/std": 3.1630361080169678, "sampling/importance_sampling_ratio/max": 0.9719449281692505, "sampling/importance_sampling_ratio/mean": 0.8259404897689819, "sampling/importance_sampling_ratio/min": 4.968862028205216e-16, "sampling/sampling_logp_difference/max": 14.974823951721191, "sampling/sampling_logp_difference/mean": 0.24025626480579376, "step": 755, "step_time": 9.01882605299761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 0.67923404276371, "epoch": 0.00756, "grad_norm": 0.02462470345199108, "kl": 0.6658830046653748, "learning_rate": 9.999940031819096e-06, "loss": -0.0208, "step": 756, "step_time": 4.455091521002032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 1251.0, "completions/max_terminated_length": 1251.0, "completions/mean_length": 635.40625, "completions/mean_terminated_length": 635.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6234366074204445, "epoch": 0.00757, "frac_reward_zero_std": 0.0, "grad_norm": 0.3025280237197876, "kl": 1.3103451579809189, "learning_rate": 9.99993986512558e-06, "loss": -0.0639, "num_tokens": 17464848.0, "reward": 1.4348536729812622, "reward_std": 3.0463919639587402, "rewards/rollout_reward_func/mean": 1.4348536729812622, "rewards/rollout_reward_func/std": 3.3321425914764404, "sampling/importance_sampling_ratio/max": 0.9380770921707153, "sampling/importance_sampling_ratio/mean": 0.4666872024536133, "sampling/importance_sampling_ratio/min": 0.025721028447151184, "sampling/sampling_logp_difference/max": 2.305488348007202, "sampling/sampling_logp_difference/mean": 0.20797955989837646, "step": 757, "step_time": 11.036256751998735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.6321484223008156, "epoch": 0.00758, "grad_norm": 0.3123398423194885, "kl": 1.323414497077465, "learning_rate": 9.99993969820071e-06, "loss": -0.0641, "step": 758, "step_time": 6.0921100770028715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01105769257992506, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01105769257992506, "completions/clipped_ratio": 0.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 369.09375, "completions/mean_terminated_length": 369.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2532411441206932, "epoch": 0.00759, "frac_reward_zero_std": 0.25, "grad_norm": 0.17732438445091248, "kl": 1.3219730406999588, "learning_rate": 9.999939531044486e-06, "loss": -0.0478, "num_tokens": 17513221.0, "reward": 0.0853583812713623, "reward_std": 2.4022164344787598, "rewards/rollout_reward_func/mean": 0.0853583812713623, "rewards/rollout_reward_func/std": 3.217820167541504, "sampling/importance_sampling_ratio/max": 0.9721183180809021, "sampling/importance_sampling_ratio/mean": 0.6172997951507568, "sampling/importance_sampling_ratio/min": 1.9395904018892907e-05, "sampling/sampling_logp_difference/max": 5.417803764343262, "sampling/sampling_logp_difference/mean": 0.189690500497818, "step": 759, "step_time": 11.345131516000038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2485244050621986, "epoch": 0.0076, "grad_norm": 0.1809891313314438, "kl": 1.3447098471224308, "learning_rate": 9.999939363656905e-06, "loss": -0.0482, "step": 760, "step_time": 5.722983423998812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 569.0, "completions/max_terminated_length": 569.0, "completions/mean_length": 35.75, "completions/mean_terminated_length": 35.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6461137514561415, "epoch": 0.00761, "frac_reward_zero_std": 0.5, "grad_norm": 0.23866702616214752, "kl": 0.29634931264445186, "learning_rate": 9.999939196037968e-06, "loss": -0.0189, "num_tokens": 17545295.0, "reward": 2.498307943344116, "reward_std": 1.445556402206421, "rewards/rollout_reward_func/mean": 2.498307943344116, "rewards/rollout_reward_func/std": 2.918619155883789, "sampling/importance_sampling_ratio/max": 1.0028939247131348, "sampling/importance_sampling_ratio/mean": 0.8723725080490112, "sampling/importance_sampling_ratio/min": 0.18300804495811462, "sampling/sampling_logp_difference/max": 1.5337321758270264, "sampling/sampling_logp_difference/mean": 0.07155914604663849, "step": 761, "step_time": 7.849923993999255 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 0.6573045961558819, "epoch": 0.00762, "grad_norm": 0.19798512756824493, "kl": 0.29230749141424894, "learning_rate": 9.999939028187675e-06, "loss": -0.0187, "step": 762, "step_time": 4.321213306999198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 353.71875, "completions/mean_terminated_length": 353.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5607333034276962, "epoch": 0.00763, "frac_reward_zero_std": 0.0, "grad_norm": 0.17701470851898193, "kl": 1.211888611316681, "learning_rate": 9.99993886010603e-06, "loss": -0.0167, "num_tokens": 17594544.0, "reward": -0.399211585521698, "reward_std": 1.5685073137283325, "rewards/rollout_reward_func/mean": -0.399211585521698, "rewards/rollout_reward_func/std": 3.310572624206543, "sampling/importance_sampling_ratio/max": 1.050401210784912, "sampling/importance_sampling_ratio/mean": 0.611096203327179, "sampling/importance_sampling_ratio/min": 0.05832807347178459, "sampling/sampling_logp_difference/max": 1.7343435287475586, "sampling/sampling_logp_difference/mean": 0.182886004447937, "step": 763, "step_time": 11.322131037997679 }, { "clip_ratio/high_max": 0.0386904776096344, "clip_ratio/high_mean": 0.0193452388048172, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02502705715596676, "entropy": 1.5750907063484192, "epoch": 0.00764, "grad_norm": 0.1836613267660141, "kl": 1.2064054384827614, "learning_rate": 9.999938691793026e-06, "loss": -0.0166, "step": 764, "step_time": 5.699718327998198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 571.0, "completions/max_terminated_length": 558.0, "completions/mean_length": 185.875, "completions/mean_terminated_length": 173.4516143798828, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.814325038343668, "epoch": 0.00765, "frac_reward_zero_std": 0.5, "grad_norm": 0.20989732444286346, "kl": 0.36001661140471697, "learning_rate": 9.999938523248667e-06, "loss": -0.037, "num_tokens": 17634213.0, "reward": 3.206308364868164, "reward_std": 1.6104907989501953, "rewards/rollout_reward_func/mean": 3.206308364868164, "rewards/rollout_reward_func/std": 2.415952205657959, "sampling/importance_sampling_ratio/max": 0.9734736680984497, "sampling/importance_sampling_ratio/mean": 0.7829186320304871, "sampling/importance_sampling_ratio/min": 0.01027699839323759, "sampling/sampling_logp_difference/max": 1.9882899522781372, "sampling/sampling_logp_difference/mean": 0.0850411206483841, "step": 765, "step_time": 8.39243431000068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 0.7962513267993927, "epoch": 0.00766, "grad_norm": 0.2052350789308548, "kl": 0.3556864857673645, "learning_rate": 9.99993835447295e-06, "loss": -0.0373, "step": 766, "step_time": 4.397512650000863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 656.0, "completions/max_terminated_length": 656.0, "completions/mean_length": 207.46875, "completions/mean_terminated_length": 207.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.331591174006462, "epoch": 0.00767, "frac_reward_zero_std": 0.25, "grad_norm": 0.20803910493850708, "kl": 0.9200304737314582, "learning_rate": 9.999938185465881e-06, "loss": -0.0597, "num_tokens": 17676172.0, "reward": 0.8099037408828735, "reward_std": 2.351588487625122, "rewards/rollout_reward_func/mean": 0.8099037408828735, "rewards/rollout_reward_func/std": 3.527212142944336, "sampling/importance_sampling_ratio/max": 0.9714179039001465, "sampling/importance_sampling_ratio/mean": 0.6515551805496216, "sampling/importance_sampling_ratio/min": 0.011698137037456036, "sampling/sampling_logp_difference/max": 1.9849380254745483, "sampling/sampling_logp_difference/mean": 0.1796674132347107, "step": 767, "step_time": 9.162983007994626 }, { "clip_ratio/high_max": 0.025240384973585606, "clip_ratio/high_mean": 0.012620192486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192486792803, "entropy": 1.298814583569765, "epoch": 0.00768, "grad_norm": 0.14256668090820312, "kl": 0.8964918917044997, "learning_rate": 9.999938016227457e-06, "loss": -0.06, "step": 768, "step_time": 5.370343492002576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 487.34375, "completions/mean_terminated_length": 487.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.508785218000412, "epoch": 0.00769, "frac_reward_zero_std": 0.25, "grad_norm": 0.1851615309715271, "kl": 1.1389975920319557, "learning_rate": 9.999937846757675e-06, "loss": -0.0533, "num_tokens": 17729312.0, "reward": 2.0407652854919434, "reward_std": 2.764073371887207, "rewards/rollout_reward_func/mean": 2.0407652854919434, "rewards/rollout_reward_func/std": 3.19980525970459, "sampling/importance_sampling_ratio/max": 0.9710696935653687, "sampling/importance_sampling_ratio/mean": 0.5523054599761963, "sampling/importance_sampling_ratio/min": 0.02949773520231247, "sampling/sampling_logp_difference/max": 1.7636137008666992, "sampling/sampling_logp_difference/mean": 0.17593953013420105, "step": 769, "step_time": 10.356438244998571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.4895095527172089, "epoch": 0.0077, "grad_norm": 0.18689337372779846, "kl": 1.1368627920746803, "learning_rate": 9.999937677056539e-06, "loss": -0.0534, "step": 770, "step_time": 5.806121367997548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 553.0, "completions/max_terminated_length": 553.0, "completions/mean_length": 187.5625, "completions/mean_terminated_length": 187.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8497284390032291, "epoch": 0.00771, "frac_reward_zero_std": 0.0, "grad_norm": 0.49488136172294617, "kl": 0.7134672049432993, "learning_rate": 9.999937507124047e-06, "loss": -0.0337, "num_tokens": 17771376.0, "reward": 0.9428192377090454, "reward_std": 2.451414108276367, "rewards/rollout_reward_func/mean": 0.9428192377090454, "rewards/rollout_reward_func/std": 3.5838520526885986, "sampling/importance_sampling_ratio/max": 1.1856516599655151, "sampling/importance_sampling_ratio/mean": 0.8366074562072754, "sampling/importance_sampling_ratio/min": 0.06134941428899765, "sampling/sampling_logp_difference/max": 1.8529890775680542, "sampling/sampling_logp_difference/mean": 0.07524767518043518, "step": 771, "step_time": 8.723430508001911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0243055559694767, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0243055559694767, "entropy": 0.8191095441579819, "epoch": 0.00772, "grad_norm": 0.21511362493038177, "kl": 0.7044257782399654, "learning_rate": 9.9999373369602e-06, "loss": -0.0361, "step": 772, "step_time": 4.988994651001121 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "completions/clipped_ratio": 0.0, "completions/max_length": 904.0, "completions/max_terminated_length": 904.0, "completions/mean_length": 480.90625, "completions/mean_terminated_length": 480.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1455512791872025, "epoch": 0.00773, "frac_reward_zero_std": 0.0, "grad_norm": 0.17132797837257385, "kl": 1.3170420825481415, "learning_rate": 9.999937166564996e-06, "loss": -0.0138, "num_tokens": 17825308.0, "reward": 1.6479823589324951, "reward_std": 1.8770095109939575, "rewards/rollout_reward_func/mean": 1.6479823589324951, "rewards/rollout_reward_func/std": 3.3450820446014404, "sampling/importance_sampling_ratio/max": 0.9969393014907837, "sampling/importance_sampling_ratio/mean": 0.6740397214889526, "sampling/importance_sampling_ratio/min": 3.0590669997110354e-09, "sampling/sampling_logp_difference/max": 9.09459114074707, "sampling/sampling_logp_difference/mean": 0.25532352924346924, "step": 773, "step_time": 9.694227826003043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.017708333674818277, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.017708333674818277, "entropy": 1.1352508664131165, "epoch": 0.00774, "grad_norm": 0.15898507833480835, "kl": 1.3306759521365166, "learning_rate": 9.999936995938438e-06, "loss": -0.0136, "step": 774, "step_time": 5.293054572999608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1102.0, "completions/max_terminated_length": 1102.0, "completions/mean_length": 338.96875, "completions/mean_terminated_length": 338.96875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9486048258841038, "epoch": 0.00775, "frac_reward_zero_std": 0.25, "grad_norm": 0.16398753225803375, "kl": 1.0773845948278904, "learning_rate": 9.999936825080522e-06, "loss": -0.0181, "num_tokens": 17872659.0, "reward": 2.2166013717651367, "reward_std": 2.4875636100769043, "rewards/rollout_reward_func/mean": 2.2166013717651367, "rewards/rollout_reward_func/std": 3.269090414047241, "sampling/importance_sampling_ratio/max": 1.098331093788147, "sampling/importance_sampling_ratio/mean": 0.7458733320236206, "sampling/importance_sampling_ratio/min": 0.024114569649100304, "sampling/sampling_logp_difference/max": 2.417893886566162, "sampling/sampling_logp_difference/mean": 0.11896682530641556, "step": 775, "step_time": 10.756038850999175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.955423966050148, "epoch": 0.00776, "grad_norm": 0.17012417316436768, "kl": 1.0829828679561615, "learning_rate": 9.999936653991253e-06, "loss": -0.0185, "step": 776, "step_time": 6.38251793299969 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 1289.0, "completions/max_terminated_length": 1289.0, "completions/mean_length": 442.78125, "completions/mean_terminated_length": 442.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.033454181626439, "epoch": 0.00777, "frac_reward_zero_std": 0.25, "grad_norm": 0.24457508325576782, "kl": 1.2035447396337986, "learning_rate": 9.99993648267063e-06, "loss": -0.0164, "num_tokens": 17924182.0, "reward": -0.34494683146476746, "reward_std": 1.843597412109375, "rewards/rollout_reward_func/mean": -0.34494683146476746, "rewards/rollout_reward_func/std": 3.1155457496643066, "sampling/importance_sampling_ratio/max": 0.9716796875, "sampling/importance_sampling_ratio/mean": 0.6549620032310486, "sampling/importance_sampling_ratio/min": 0.0640171617269516, "sampling/sampling_logp_difference/max": 1.9765210151672363, "sampling/sampling_logp_difference/mean": 0.14293554425239563, "step": 777, "step_time": 11.223739840001144 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "entropy": 1.0340547747910023, "epoch": 0.00778, "grad_norm": 0.18204732239246368, "kl": 1.169134585186839, "learning_rate": 9.999936311118648e-06, "loss": -0.0165, "step": 778, "step_time": 6.182884581001417 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013194444589316845, "completions/clipped_ratio": 0.0, "completions/max_length": 1173.0, "completions/max_terminated_length": 1173.0, "completions/mean_length": 398.0, "completions/mean_terminated_length": 398.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.280853172764182, "epoch": 0.00779, "frac_reward_zero_std": 0.0, "grad_norm": 0.4230082929134369, "kl": 1.9452119767665863, "learning_rate": 9.999936139335313e-06, "loss": -0.0271, "num_tokens": 17975527.0, "reward": 2.3869433403015137, "reward_std": 2.4538345336914062, "rewards/rollout_reward_func/mean": 2.3869433403015137, "rewards/rollout_reward_func/std": 3.4601457118988037, "sampling/importance_sampling_ratio/max": 1.106871485710144, "sampling/importance_sampling_ratio/mean": 0.6793982982635498, "sampling/importance_sampling_ratio/min": 1.0537559319345746e-05, "sampling/sampling_logp_difference/max": 2.1425623893737793, "sampling/sampling_logp_difference/mean": 0.18188530206680298, "step": 779, "step_time": 10.328400633998172 }, { "clip_ratio/high_max": 0.038888889364898205, "clip_ratio/high_mean": 0.019444444682449102, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "entropy": 1.3193305395543575, "epoch": 0.0078, "grad_norm": 0.2607039213180542, "kl": 1.8331837505102158, "learning_rate": 9.999935967320623e-06, "loss": -0.0288, "step": 780, "step_time": 6.217766916002802 }, { "clip_ratio/high_max": 0.013602941296994686, "clip_ratio/high_mean": 0.006801470648497343, "clip_ratio/low_mean": 0.003289473708719015, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010090944357216358, "completions/clipped_ratio": 0.03125, "completions/max_length": 1560.0, "completions/max_terminated_length": 1560.0, "completions/mean_length": 715.8125, "completions/mean_terminated_length": 720.51611328125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.1645045280456543, "epoch": 0.00781, "frac_reward_zero_std": 0.25, "grad_norm": 0.16939423978328705, "kl": 1.1946079321205616, "learning_rate": 9.999935795074575e-06, "loss": -0.0073, "num_tokens": 18033440.0, "reward": 1.181983470916748, "reward_std": 2.330935001373291, "rewards/rollout_reward_func/mean": 1.181983470916748, "rewards/rollout_reward_func/std": 3.364081859588623, "sampling/importance_sampling_ratio/max": 0.9756994843482971, "sampling/importance_sampling_ratio/mean": 0.3218323588371277, "sampling/importance_sampling_ratio/min": 1.7285614606343813e-10, "sampling/sampling_logp_difference/max": 17.89606285095215, "sampling/sampling_logp_difference/mean": 0.49892890453338623, "step": 781, "step_time": 13.91378753999743 }, { "clip_ratio/high_max": 0.022058824077248573, "clip_ratio/high_mean": 0.011029412038624287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011029412038624287, "entropy": 2.183238483965397, "epoch": 0.00782, "grad_norm": 0.14838813245296478, "kl": 1.2091335728764534, "learning_rate": 9.999935622597175e-06, "loss": -0.0076, "step": 782, "step_time": 6.733461265002916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1647.0, "completions/max_terminated_length": 1647.0, "completions/mean_length": 553.1875, "completions/mean_terminated_length": 553.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.31675149127841, "epoch": 0.00783, "frac_reward_zero_std": 0.25, "grad_norm": 0.1682482808828354, "kl": 0.9446869790554047, "learning_rate": 9.999935449888417e-06, "loss": -0.0417, "num_tokens": 18086556.0, "reward": 2.761962652206421, "reward_std": 2.4015302658081055, "rewards/rollout_reward_func/mean": 2.761962652206421, "rewards/rollout_reward_func/std": 3.137685537338257, "sampling/importance_sampling_ratio/max": 0.976446270942688, "sampling/importance_sampling_ratio/mean": 0.6148659586906433, "sampling/importance_sampling_ratio/min": 1.2366231771920866e-07, "sampling/sampling_logp_difference/max": 8.743054389953613, "sampling/sampling_logp_difference/mean": 0.25475603342056274, "step": 783, "step_time": 12.23728083700371 }, { "clip_ratio/high_max": 0.025240384973585606, "clip_ratio/high_mean": 0.012620192486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012620192486792803, "entropy": 1.3247065246105194, "epoch": 0.00784, "grad_norm": 0.12190580368041992, "kl": 0.9370971992611885, "learning_rate": 9.999935276948304e-06, "loss": -0.0418, "step": 784, "step_time": 7.367775787000937 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009615384973585606, "completions/clipped_ratio": 0.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 608.3125, "completions/mean_terminated_length": 608.3125, "completions/min_length": 316.0, "completions/min_terminated_length": 316.0, "entropy": 0.7583591677248478, "epoch": 0.00785, "frac_reward_zero_std": 0.0, "grad_norm": 0.20419834554195404, "kl": 1.3393670618534088, "learning_rate": 9.999935103776837e-06, "loss": 0.0033, "num_tokens": 18145316.0, "reward": 2.9514851570129395, "reward_std": 2.3941712379455566, "rewards/rollout_reward_func/mean": 2.9514851570129395, "rewards/rollout_reward_func/std": 3.017740249633789, "sampling/importance_sampling_ratio/max": 0.9468534588813782, "sampling/importance_sampling_ratio/mean": 0.7428745627403259, "sampling/importance_sampling_ratio/min": 0.06360060721635818, "sampling/sampling_logp_difference/max": 1.9730775356292725, "sampling/sampling_logp_difference/mean": 0.08149249851703644, "step": 785, "step_time": 11.557638480002424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.792889766395092, "epoch": 0.00786, "grad_norm": 0.24285300076007843, "kl": 1.3404271751642227, "learning_rate": 9.999934930374015e-06, "loss": 0.0026, "step": 786, "step_time": 6.200578194000627 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 954.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 248.0, "completions/mean_terminated_length": 248.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.288113083690405, "epoch": 0.00787, "frac_reward_zero_std": 0.0, "grad_norm": 0.19931171834468842, "kl": 1.4376744404435158, "learning_rate": 9.999934756739835e-06, "loss": -0.045, "num_tokens": 18190091.0, "reward": 1.3920717239379883, "reward_std": 2.3583600521087646, "rewards/rollout_reward_func/mean": 1.3920717239379883, "rewards/rollout_reward_func/std": 3.6031739711761475, "sampling/importance_sampling_ratio/max": 0.9741820096969604, "sampling/importance_sampling_ratio/mean": 0.6745182275772095, "sampling/importance_sampling_ratio/min": 0.028016487136483192, "sampling/sampling_logp_difference/max": 2.655808210372925, "sampling/sampling_logp_difference/mean": 0.1960229128599167, "step": 787, "step_time": 9.08721583499755 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 1.3222650960087776, "epoch": 0.00788, "grad_norm": 0.1861310750246048, "kl": 1.4383085146546364, "learning_rate": 9.999934582874302e-06, "loss": -0.045, "step": 788, "step_time": 5.517818682003053 }, { "clip_ratio/high_max": 0.004166666883975267, "clip_ratio/high_mean": 0.0020833334419876337, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0020833334419876337, "completions/clipped_ratio": 0.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 732.53125, "completions/mean_terminated_length": 732.53125, "completions/min_length": 380.0, "completions/min_terminated_length": 380.0, "entropy": 1.9744566679000854, "epoch": 0.00789, "frac_reward_zero_std": 0.0, "grad_norm": 0.29975107312202454, "kl": 1.723588764667511, "learning_rate": 9.999934408777411e-06, "loss": -0.0147, "num_tokens": 18251735.0, "reward": 2.0374135971069336, "reward_std": 2.2888362407684326, "rewards/rollout_reward_func/mean": 2.0374135971069336, "rewards/rollout_reward_func/std": 3.2133660316467285, "sampling/importance_sampling_ratio/max": 0.9478480815887451, "sampling/importance_sampling_ratio/mean": 0.39250731468200684, "sampling/importance_sampling_ratio/min": 6.486868073807273e-07, "sampling/sampling_logp_difference/max": 8.984831809997559, "sampling/sampling_logp_difference/mean": 0.3523785471916199, "step": 789, "step_time": 12.25592287499967 }, { "clip_ratio/high_max": 0.016666667070239782, "clip_ratio/high_mean": 0.008333333535119891, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333535119891, "entropy": 2.0031811892986298, "epoch": 0.0079, "grad_norm": 0.231133371591568, "kl": 1.6324723586440086, "learning_rate": 9.999934234449167e-06, "loss": -0.015, "step": 790, "step_time": 6.558242818993676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 205.0625, "completions/mean_terminated_length": 205.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5288402549922466, "epoch": 0.00791, "frac_reward_zero_std": 0.5, "grad_norm": 0.13874849677085876, "kl": 0.7343283221125603, "learning_rate": 9.999934059889568e-06, "loss": -0.0169, "num_tokens": 18292537.0, "reward": 2.630756378173828, "reward_std": 1.2142844200134277, "rewards/rollout_reward_func/mean": 2.630756378173828, "rewards/rollout_reward_func/std": 2.8661136627197266, "sampling/importance_sampling_ratio/max": 1.0697771310806274, "sampling/importance_sampling_ratio/mean": 0.8585436940193176, "sampling/importance_sampling_ratio/min": 0.1800524741411209, "sampling/sampling_logp_difference/max": 1.5645456314086914, "sampling/sampling_logp_difference/mean": 0.05773542448878288, "step": 791, "step_time": 10.078299428998434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5491988323628902, "epoch": 0.00792, "grad_norm": 0.1492188423871994, "kl": 0.734172910451889, "learning_rate": 9.999933885098614e-06, "loss": -0.0168, "step": 792, "step_time": 5.869524057005037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1607.0, "completions/max_terminated_length": 1607.0, "completions/mean_length": 406.53125, "completions/mean_terminated_length": 406.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7518869936466217, "epoch": 0.00793, "frac_reward_zero_std": 0.0, "grad_norm": 0.23864798247814178, "kl": 0.8781761098653078, "learning_rate": 9.999933710076304e-06, "loss": -0.0443, "num_tokens": 18342515.0, "reward": 1.1522314548492432, "reward_std": 3.5632412433624268, "rewards/rollout_reward_func/mean": 1.1522314548492432, "rewards/rollout_reward_func/std": 3.592491626739502, "sampling/importance_sampling_ratio/max": 0.9519428610801697, "sampling/importance_sampling_ratio/mean": 0.4778412878513336, "sampling/importance_sampling_ratio/min": 0.0010966836707666516, "sampling/sampling_logp_difference/max": 2.576050281524658, "sampling/sampling_logp_difference/mean": 0.24096538126468658, "step": 793, "step_time": 12.361214763002863 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.7471209466457367, "epoch": 0.00794, "grad_norm": 0.27427148818969727, "kl": 0.895180206745863, "learning_rate": 9.999933534822638e-06, "loss": -0.0445, "step": 794, "step_time": 6.582677616997898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1214.0, "completions/max_terminated_length": 1214.0, "completions/mean_length": 480.28125, "completions/mean_terminated_length": 480.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.319069042801857, "epoch": 0.00795, "frac_reward_zero_std": 0.25, "grad_norm": 0.14490990340709686, "kl": 0.8696244470775127, "learning_rate": 9.999933359337616e-06, "loss": -0.0085, "num_tokens": 18393684.0, "reward": 1.2844234704971313, "reward_std": 2.165283441543579, "rewards/rollout_reward_func/mean": 1.2844234704971313, "rewards/rollout_reward_func/std": 3.108370304107666, "sampling/importance_sampling_ratio/max": 0.9724529385566711, "sampling/importance_sampling_ratio/mean": 0.6148220300674438, "sampling/importance_sampling_ratio/min": 4.14161040680483e-05, "sampling/sampling_logp_difference/max": 2.3136720657348633, "sampling/sampling_logp_difference/mean": 0.20987261831760406, "step": 795, "step_time": 10.56462400399505 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010489510837942362, "entropy": 1.319218173623085, "epoch": 0.00796, "grad_norm": 0.12503676116466522, "kl": 0.8674016231670976, "learning_rate": 9.999933183621241e-06, "loss": -0.0088, "step": 796, "step_time": 5.854765805999705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02142857201397419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02142857201397419, "completions/clipped_ratio": 0.0, "completions/max_length": 1278.0, "completions/max_terminated_length": 1278.0, "completions/mean_length": 518.34375, "completions/mean_terminated_length": 518.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.4349012672901154, "epoch": 0.00797, "frac_reward_zero_std": 0.0, "grad_norm": 0.13000009953975677, "kl": 1.2007866613566875, "learning_rate": 9.99993300767351e-06, "loss": -0.0618, "num_tokens": 18449488.0, "reward": 2.3832879066467285, "reward_std": 2.1315245628356934, "rewards/rollout_reward_func/mean": 2.3832879066467285, "rewards/rollout_reward_func/std": 3.168912172317505, "sampling/importance_sampling_ratio/max": 1.5650736093521118, "sampling/importance_sampling_ratio/mean": 0.6399573087692261, "sampling/importance_sampling_ratio/min": 0.0010746747720986605, "sampling/sampling_logp_difference/max": 3.2913520336151123, "sampling/sampling_logp_difference/mean": 0.22994834184646606, "step": 797, "step_time": 11.555918092000866 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "entropy": 1.4479553252458572, "epoch": 0.00798, "grad_norm": 0.18294738233089447, "kl": 1.2107014991343021, "learning_rate": 9.999932831494424e-06, "loss": -0.0616, "step": 798, "step_time": 6.7913441410019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.03125, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 479.90625, "completions/mean_terminated_length": 481.3548278808594, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8192110359668732, "epoch": 0.00799, "frac_reward_zero_std": 0.0, "grad_norm": 0.3143521845340729, "kl": 1.2416492998600006, "learning_rate": 9.999932655083982e-06, "loss": -0.0328, "num_tokens": 18502454.0, "reward": 1.159393548965454, "reward_std": 2.3666248321533203, "rewards/rollout_reward_func/mean": 1.159393548965454, "rewards/rollout_reward_func/std": 3.2976458072662354, "sampling/importance_sampling_ratio/max": 0.9741641283035278, "sampling/importance_sampling_ratio/mean": 0.48439621925354004, "sampling/importance_sampling_ratio/min": 0.0004069458518642932, "sampling/sampling_logp_difference/max": 2.896735191345215, "sampling/sampling_logp_difference/mean": 0.2947911024093628, "step": 799, "step_time": 11.76860421299898 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010529891354963183, "entropy": 1.8067255020141602, "epoch": 0.008, "grad_norm": 0.17840810120105743, "kl": 1.247112289071083, "learning_rate": 9.999932478442186e-06, "loss": -0.0334, "step": 800, "step_time": 6.390461748002053 }, { "clip_ratio/high_max": 0.016281513031572104, "clip_ratio/high_mean": 0.008140756515786052, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01382257486693561, "completions/clipped_ratio": 0.0, "completions/max_length": 1440.0, "completions/max_terminated_length": 1440.0, "completions/mean_length": 676.25, "completions/mean_terminated_length": 676.25, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.0142200589179993, "epoch": 0.00801, "frac_reward_zero_std": 0.0, "grad_norm": 0.5223026275634766, "kl": 1.6775896027684212, "learning_rate": 9.999932301569035e-06, "loss": -0.0372, "num_tokens": 18563801.0, "reward": 1.8602116107940674, "reward_std": 3.2979302406311035, "rewards/rollout_reward_func/mean": 1.8602116107940674, "rewards/rollout_reward_func/std": 3.2512528896331787, "sampling/importance_sampling_ratio/max": 0.967223048210144, "sampling/importance_sampling_ratio/mean": 0.43728697299957275, "sampling/importance_sampling_ratio/min": 8.242063387342569e-08, "sampling/sampling_logp_difference/max": 8.849836349487305, "sampling/sampling_logp_difference/mean": 0.37406113743782043, "step": 801, "step_time": 12.721313709998867 }, { "clip_ratio/high_max": 0.016741071827709675, "clip_ratio/high_mean": 0.008370535913854837, "clip_ratio/low_mean": 0.010890151839703321, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01926068775355816, "entropy": 2.02072561532259, "epoch": 0.00802, "grad_norm": 0.2670314908027649, "kl": 1.4493482802063227, "learning_rate": 9.999932124464527e-06, "loss": -0.0386, "step": 802, "step_time": 7.047327074002169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1087.0, "completions/max_terminated_length": 1087.0, "completions/mean_length": 282.78125, "completions/mean_terminated_length": 282.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8050033524632454, "epoch": 0.00803, "frac_reward_zero_std": 0.25, "grad_norm": 0.32520022988319397, "kl": 0.712996993213892, "learning_rate": 9.999931947128666e-06, "loss": -0.0342, "num_tokens": 18609071.0, "reward": 2.881807804107666, "reward_std": 1.9023600816726685, "rewards/rollout_reward_func/mean": 2.881807804107666, "rewards/rollout_reward_func/std": 2.9272050857543945, "sampling/importance_sampling_ratio/max": 0.9745005965232849, "sampling/importance_sampling_ratio/mean": 0.7766885161399841, "sampling/importance_sampling_ratio/min": 0.060998979955911636, "sampling/sampling_logp_difference/max": 2.4204704761505127, "sampling/sampling_logp_difference/mean": 0.08524975925683975, "step": 803, "step_time": 10.426058027000181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8010074198246002, "epoch": 0.00804, "grad_norm": 0.24006427824497223, "kl": 0.7288035936653614, "learning_rate": 9.999931769561446e-06, "loss": -0.0348, "step": 804, "step_time": 5.629629878001651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1090.0, "completions/max_terminated_length": 1090.0, "completions/mean_length": 377.34375, "completions/mean_terminated_length": 377.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5479185208678246, "epoch": 0.00805, "frac_reward_zero_std": 0.25, "grad_norm": 0.2347497045993805, "kl": 1.0095960684120655, "learning_rate": 9.999931591762874e-06, "loss": -0.0484, "num_tokens": 18659005.0, "reward": 1.1900895833969116, "reward_std": 2.6806793212890625, "rewards/rollout_reward_func/mean": 1.1900895833969116, "rewards/rollout_reward_func/std": 3.359032392501831, "sampling/importance_sampling_ratio/max": 0.9709169864654541, "sampling/importance_sampling_ratio/mean": 0.5726343393325806, "sampling/importance_sampling_ratio/min": 0.002942431950941682, "sampling/sampling_logp_difference/max": 2.4349563121795654, "sampling/sampling_logp_difference/mean": 0.21232274174690247, "step": 805, "step_time": 10.940604695002548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5387066751718521, "epoch": 0.00806, "grad_norm": 0.2377120852470398, "kl": 1.008693777024746, "learning_rate": 9.999931413732947e-06, "loss": -0.0486, "step": 806, "step_time": 6.3419125890013675 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012500000186264515, "completions/clipped_ratio": 0.0, "completions/max_length": 1123.0, "completions/max_terminated_length": 1123.0, "completions/mean_length": 408.09375, "completions/mean_terminated_length": 408.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3630025796592236, "epoch": 0.00807, "frac_reward_zero_std": 0.25, "grad_norm": 0.15637555718421936, "kl": 0.9347991831600666, "learning_rate": 9.999931235471665e-06, "loss": -0.0227, "num_tokens": 18707813.0, "reward": 3.678755283355713, "reward_std": 1.34188973903656, "rewards/rollout_reward_func/mean": 3.678755283355713, "rewards/rollout_reward_func/std": 2.229951858520508, "sampling/importance_sampling_ratio/max": 0.9737840294837952, "sampling/importance_sampling_ratio/mean": 0.648072361946106, "sampling/importance_sampling_ratio/min": 0.0008105289307422936, "sampling/sampling_logp_difference/max": 2.455310583114624, "sampling/sampling_logp_difference/mean": 0.20758572220802307, "step": 807, "step_time": 10.503189343004124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3706167489290237, "epoch": 0.00808, "grad_norm": 0.16853845119476318, "kl": 0.915478965267539, "learning_rate": 9.999931056979026e-06, "loss": -0.0231, "step": 808, "step_time": 5.772254422001424 }, { "clip_ratio/high_max": 0.011386639904230833, "clip_ratio/high_mean": 0.0056933199521154165, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009599569952115417, "completions/clipped_ratio": 0.0, "completions/max_length": 1735.0, "completions/max_terminated_length": 1735.0, "completions/mean_length": 770.53125, "completions/mean_terminated_length": 770.53125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 2.276660054922104, "epoch": 0.00809, "frac_reward_zero_std": 0.0, "grad_norm": 0.1956978738307953, "kl": 1.402255803346634, "learning_rate": 9.999930878255035e-06, "loss": -0.0535, "num_tokens": 18771470.0, "reward": 1.407801628112793, "reward_std": 3.528730630874634, "rewards/rollout_reward_func/mean": 1.407801628112793, "rewards/rollout_reward_func/std": 3.362034320831299, "sampling/importance_sampling_ratio/max": 0.971396803855896, "sampling/importance_sampling_ratio/mean": 0.3172434866428375, "sampling/importance_sampling_ratio/min": 5.179843641685278e-16, "sampling/sampling_logp_difference/max": 14.733379364013672, "sampling/sampling_logp_difference/mean": 0.49121612310409546, "step": 809, "step_time": 13.753987420999692 }, { "clip_ratio/high_max": 0.015865385066717863, "clip_ratio/high_mean": 0.007932692533358932, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012740385020151734, "entropy": 2.2321717590093613, "epoch": 0.0081, "grad_norm": 0.15713199973106384, "kl": 1.382628656923771, "learning_rate": 9.999930699299686e-06, "loss": -0.0541, "step": 810, "step_time": 7.852223380999931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1210.0, "completions/max_terminated_length": 1210.0, "completions/mean_length": 357.59375, "completions/mean_terminated_length": 357.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7411064617335796, "epoch": 0.00811, "frac_reward_zero_std": 0.25, "grad_norm": 0.489723801612854, "kl": 0.8348565269261599, "learning_rate": 9.999930520112983e-06, "loss": -0.0191, "num_tokens": 18818806.0, "reward": 2.8880910873413086, "reward_std": 2.3430070877075195, "rewards/rollout_reward_func/mean": 2.8880910873413086, "rewards/rollout_reward_func/std": 2.852797746658325, "sampling/importance_sampling_ratio/max": 1.0964163541793823, "sampling/importance_sampling_ratio/mean": 0.8246892690658569, "sampling/importance_sampling_ratio/min": 0.02334657870233059, "sampling/sampling_logp_difference/max": 2.240049123764038, "sampling/sampling_logp_difference/mean": 0.10363821685314178, "step": 811, "step_time": 10.70006816199566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 0.7386311199516058, "epoch": 0.00812, "grad_norm": 0.20869359374046326, "kl": 0.9386582113802433, "learning_rate": 9.999930340694926e-06, "loss": -0.0192, "step": 812, "step_time": 5.958414069000355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 386.28125, "completions/mean_terminated_length": 386.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1321880705654621, "epoch": 0.00813, "frac_reward_zero_std": 0.0, "grad_norm": 0.41631582379341125, "kl": 1.1390374265611172, "learning_rate": 9.999930161045511e-06, "loss": -0.0244, "num_tokens": 18869026.0, "reward": 1.6832501888275146, "reward_std": 2.54915189743042, "rewards/rollout_reward_func/mean": 1.6832501888275146, "rewards/rollout_reward_func/std": 3.5093727111816406, "sampling/importance_sampling_ratio/max": 1.3374595642089844, "sampling/importance_sampling_ratio/mean": 0.72395920753479, "sampling/importance_sampling_ratio/min": 0.02250545099377632, "sampling/sampling_logp_difference/max": 2.226951837539673, "sampling/sampling_logp_difference/mean": 0.12901826202869415, "step": 813, "step_time": 11.028947304001122 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.01145833358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000558793545, "entropy": 1.0400078129023314, "epoch": 0.00814, "grad_norm": 0.275250107049942, "kl": 1.1285837814211845, "learning_rate": 9.999929981164743e-06, "loss": -0.0258, "step": 814, "step_time": 6.9895919380014675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1194.0, "completions/max_terminated_length": 1194.0, "completions/mean_length": 344.78125, "completions/mean_terminated_length": 344.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2369124814867973, "epoch": 0.00815, "frac_reward_zero_std": 0.5, "grad_norm": 0.08446843177080154, "kl": 1.297864742577076, "learning_rate": 9.99992980105262e-06, "loss": 0.0019, "num_tokens": 18915583.0, "reward": 1.6207911968231201, "reward_std": 1.500463843345642, "rewards/rollout_reward_func/mean": 1.6207911968231201, "rewards/rollout_reward_func/std": 3.003096342086792, "sampling/importance_sampling_ratio/max": 0.9764278531074524, "sampling/importance_sampling_ratio/mean": 0.6887788772583008, "sampling/importance_sampling_ratio/min": 4.1946990607054076e-17, "sampling/sampling_logp_difference/max": 14.69916820526123, "sampling/sampling_logp_difference/mean": 0.3318294584751129, "step": 815, "step_time": 10.811985641998035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.198555313050747, "epoch": 0.00816, "grad_norm": 0.07622030377388, "kl": 1.3042717725038528, "learning_rate": 9.999929620709142e-06, "loss": 0.0018, "step": 816, "step_time": 5.980438017000779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 679.0, "completions/max_terminated_length": 679.0, "completions/mean_length": 119.25, "completions/mean_terminated_length": 119.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6166456416249275, "epoch": 0.00817, "frac_reward_zero_std": 0.25, "grad_norm": 0.23244494199752808, "kl": 0.9035045681521297, "learning_rate": 9.999929440134308e-06, "loss": -0.023, "num_tokens": 18953669.0, "reward": 2.235772132873535, "reward_std": 0.7875446081161499, "rewards/rollout_reward_func/mean": 2.235772132873535, "rewards/rollout_reward_func/std": 3.0620317459106445, "sampling/importance_sampling_ratio/max": 1.2460662126541138, "sampling/importance_sampling_ratio/mean": 0.8808611631393433, "sampling/importance_sampling_ratio/min": 0.12241169065237045, "sampling/sampling_logp_difference/max": 0.8107170462608337, "sampling/sampling_logp_difference/mean": 0.058285437524318695, "step": 817, "step_time": 8.391435023999293 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.5872910805046558, "epoch": 0.00818, "grad_norm": 0.36136895418167114, "kl": 0.9013215648010373, "learning_rate": 9.99992925932812e-06, "loss": -0.0233, "step": 818, "step_time": 4.866163900001993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1245.0, "completions/max_terminated_length": 1245.0, "completions/mean_length": 429.1875, "completions/mean_terminated_length": 429.1875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0525306575000286, "epoch": 0.00819, "frac_reward_zero_std": 0.25, "grad_norm": 0.10450677573680878, "kl": 0.7028769170865417, "learning_rate": 9.999929078290578e-06, "loss": -0.0391, "num_tokens": 19003314.0, "reward": 3.269468307495117, "reward_std": 1.5403859615325928, "rewards/rollout_reward_func/mean": 3.269468307495117, "rewards/rollout_reward_func/std": 2.641970634460449, "sampling/importance_sampling_ratio/max": 0.9908950328826904, "sampling/importance_sampling_ratio/mean": 0.7301174402236938, "sampling/importance_sampling_ratio/min": 0.021396616473793983, "sampling/sampling_logp_difference/max": 2.023688793182373, "sampling/sampling_logp_difference/mean": 0.12731632590293884, "step": 819, "step_time": 12.069762331997481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0430048108100891, "epoch": 0.0082, "grad_norm": 0.10685921460390091, "kl": 0.6999335866421461, "learning_rate": 9.99992889702168e-06, "loss": -0.0391, "step": 820, "step_time": 6.056164740004533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 557.0, "completions/max_terminated_length": 557.0, "completions/mean_length": 241.28125, "completions/mean_terminated_length": 241.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.46457662992179394, "epoch": 0.00821, "frac_reward_zero_std": 0.25, "grad_norm": 0.3229265511035919, "kl": 0.5220752125605941, "learning_rate": 9.999928715521427e-06, "loss": -0.0444, "num_tokens": 19047745.0, "reward": 3.140475273132324, "reward_std": 1.5113370418548584, "rewards/rollout_reward_func/mean": 3.140475273132324, "rewards/rollout_reward_func/std": 2.870821475982666, "sampling/importance_sampling_ratio/max": 1.0814369916915894, "sampling/importance_sampling_ratio/mean": 0.9104488492012024, "sampling/importance_sampling_ratio/min": 0.041850022971630096, "sampling/sampling_logp_difference/max": 1.6360492706298828, "sampling/sampling_logp_difference/mean": 0.04632795602083206, "step": 821, "step_time": 8.407131983001818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "entropy": 0.4790910519659519, "epoch": 0.00822, "grad_norm": 0.28369036316871643, "kl": 0.5225991504266858, "learning_rate": 9.999928533789818e-06, "loss": -0.0456, "step": 822, "step_time": 5.01480748600261 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.01046836213208735, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01437461213208735, "completions/clipped_ratio": 0.0, "completions/max_length": 1094.0, "completions/max_terminated_length": 1094.0, "completions/mean_length": 337.21875, "completions/mean_terminated_length": 337.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9792143292725086, "epoch": 0.00823, "frac_reward_zero_std": 0.25, "grad_norm": 0.11513926833868027, "kl": 1.1467781253159046, "learning_rate": 9.999928351826855e-06, "loss": -0.0387, "num_tokens": 19095565.0, "reward": 2.104948043823242, "reward_std": 1.6272826194763184, "rewards/rollout_reward_func/mean": 2.104948043823242, "rewards/rollout_reward_func/std": 3.463890314102173, "sampling/importance_sampling_ratio/max": 0.9737833738327026, "sampling/importance_sampling_ratio/mean": 0.7447643876075745, "sampling/importance_sampling_ratio/min": 1.0838417698926151e-17, "sampling/sampling_logp_difference/max": 13.350428581237793, "sampling/sampling_logp_difference/mean": 0.3986027240753174, "step": 823, "step_time": 11.416291718000139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.006048386916518211, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006048386916518211, "entropy": 0.972740612924099, "epoch": 0.00824, "grad_norm": 0.11615967005491257, "kl": 1.1554054114967585, "learning_rate": 9.999928169632538e-06, "loss": -0.0388, "step": 824, "step_time": 5.628350669003339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.03125, "completions/max_length": 1512.0, "completions/max_terminated_length": 1512.0, "completions/mean_length": 227.65625, "completions/mean_terminated_length": 218.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0574997402727604, "epoch": 0.00825, "frac_reward_zero_std": 0.5, "grad_norm": 0.21265123784542084, "kl": 0.6652170680463314, "learning_rate": 9.999927987206866e-06, "loss": -0.0274, "num_tokens": 19135317.0, "reward": 1.9016026258468628, "reward_std": 1.5719878673553467, "rewards/rollout_reward_func/mean": 1.9016026258468628, "rewards/rollout_reward_func/std": 3.233576774597168, "sampling/importance_sampling_ratio/max": 0.9773030877113342, "sampling/importance_sampling_ratio/mean": 0.751845121383667, "sampling/importance_sampling_ratio/min": 9.577904620527988e-07, "sampling/sampling_logp_difference/max": 8.389100074768066, "sampling/sampling_logp_difference/mean": 0.2722112536430359, "step": 825, "step_time": 11.254792007997821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0528681203722954, "epoch": 0.00826, "grad_norm": 0.23965467512607574, "kl": 0.6722673811018467, "learning_rate": 9.999927804549838e-06, "loss": -0.0282, "step": 826, "step_time": 6.507386928000415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 369.40625, "completions/mean_terminated_length": 369.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.85471535846591, "epoch": 0.00827, "frac_reward_zero_std": 0.25, "grad_norm": 0.34557563066482544, "kl": 1.224846974015236, "learning_rate": 9.999927621661456e-06, "loss": -0.0425, "num_tokens": 19183513.0, "reward": 2.2633893489837646, "reward_std": 2.5509796142578125, "rewards/rollout_reward_func/mean": 2.2633893489837646, "rewards/rollout_reward_func/std": 3.023244857788086, "sampling/importance_sampling_ratio/max": 1.0516389608383179, "sampling/importance_sampling_ratio/mean": 0.7375705242156982, "sampling/importance_sampling_ratio/min": 0.1448536068201065, "sampling/sampling_logp_difference/max": 1.7616140842437744, "sampling/sampling_logp_difference/mean": 0.0994252860546112, "step": 827, "step_time": 11.437889544000427 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.012500000186264515, "clip_ratio/low_mean": 0.010714286006987095, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02321428619325161, "entropy": 0.863026075065136, "epoch": 0.00828, "grad_norm": 0.18493139743804932, "kl": 1.2284699194133282, "learning_rate": 9.999927438541716e-06, "loss": -0.0433, "step": 828, "step_time": 5.643608899001265 }, { "clip_ratio/high_max": 0.016666667070239782, "clip_ratio/high_mean": 0.008333333535119891, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013141026021912694, "completions/clipped_ratio": 0.0, "completions/max_length": 1067.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 331.0625, "completions/mean_terminated_length": 331.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.275504246354103, "epoch": 0.00829, "frac_reward_zero_std": 0.25, "grad_norm": 0.13191695511341095, "kl": 1.2363015562295914, "learning_rate": 9.999927255190625e-06, "loss": 0.0093, "num_tokens": 19231011.0, "reward": 2.133061647415161, "reward_std": 2.1665077209472656, "rewards/rollout_reward_func/mean": 2.133061647415161, "rewards/rollout_reward_func/std": 3.450148820877075, "sampling/importance_sampling_ratio/max": 0.9761171340942383, "sampling/importance_sampling_ratio/mean": 0.6634405851364136, "sampling/importance_sampling_ratio/min": 2.5744092594466236e-14, "sampling/sampling_logp_difference/max": 13.563932418823242, "sampling/sampling_logp_difference/mean": 0.4656115174293518, "step": 829, "step_time": 10.59131298200009 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.01743395533412695, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021600622218102217, "entropy": 1.2957862131297588, "epoch": 0.0083, "grad_norm": 0.13100503385066986, "kl": 1.241217814385891, "learning_rate": 9.999927071608178e-06, "loss": 0.0091, "step": 830, "step_time": 5.648426715999449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1133.0, "completions/max_terminated_length": 1133.0, "completions/mean_length": 641.5, "completions/mean_terminated_length": 641.5, "completions/min_length": 260.0, "completions/min_terminated_length": 260.0, "entropy": 1.4421511590480804, "epoch": 0.00831, "frac_reward_zero_std": 0.0, "grad_norm": 0.21294386684894562, "kl": 1.7224566116929054, "learning_rate": 9.999926887794375e-06, "loss": -0.0, "num_tokens": 19291093.0, "reward": 2.303215742111206, "reward_std": 1.6635026931762695, "rewards/rollout_reward_func/mean": 2.303215742111206, "rewards/rollout_reward_func/std": 3.3425967693328857, "sampling/importance_sampling_ratio/max": 0.9503122568130493, "sampling/importance_sampling_ratio/mean": 0.5477883815765381, "sampling/importance_sampling_ratio/min": 0.018022459000349045, "sampling/sampling_logp_difference/max": 2.4260406494140625, "sampling/sampling_logp_difference/mean": 0.17829497158527374, "step": 831, "step_time": 12.08603357100219 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.4239733889698982, "epoch": 0.00832, "grad_norm": 0.23400360345840454, "kl": 1.730831578373909, "learning_rate": 9.99992670374922e-06, "loss": 0.0002, "step": 832, "step_time": 5.940059031005148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1101.0, "completions/max_terminated_length": 1101.0, "completions/mean_length": 315.6875, "completions/mean_terminated_length": 315.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1302087493240833, "epoch": 0.00833, "frac_reward_zero_std": 0.25, "grad_norm": 0.18635205924510956, "kl": 1.3260267870500684, "learning_rate": 9.999926519472708e-06, "loss": -0.029, "num_tokens": 19334322.0, "reward": 2.564579486846924, "reward_std": 2.2590882778167725, "rewards/rollout_reward_func/mean": 2.564579486846924, "rewards/rollout_reward_func/std": 3.0742671489715576, "sampling/importance_sampling_ratio/max": 0.9762728214263916, "sampling/importance_sampling_ratio/mean": 0.7162017822265625, "sampling/importance_sampling_ratio/min": 0.009134447202086449, "sampling/sampling_logp_difference/max": 2.2005581855773926, "sampling/sampling_logp_difference/mean": 0.14779138565063477, "step": 833, "step_time": 10.372289304001242 }, { "clip_ratio/high_max": 0.02291666716337204, "clip_ratio/high_mean": 0.01145833358168602, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01145833358168602, "entropy": 1.1208370253443718, "epoch": 0.00834, "grad_norm": 0.1647341400384903, "kl": 1.3719087643548846, "learning_rate": 9.99992633496484e-06, "loss": -0.0293, "step": 834, "step_time": 5.675593326997841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1524.0, "completions/max_terminated_length": 1524.0, "completions/mean_length": 479.5625, "completions/mean_terminated_length": 477.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.177848931401968, "epoch": 0.00835, "frac_reward_zero_std": 0.25, "grad_norm": 0.26004427671432495, "kl": 0.8498996552079916, "learning_rate": 9.99992615022562e-06, "loss": -0.0161, "num_tokens": 19387310.0, "reward": 3.28096342086792, "reward_std": 1.5733215808868408, "rewards/rollout_reward_func/mean": 3.28096342086792, "rewards/rollout_reward_func/std": 2.588765859603882, "sampling/importance_sampling_ratio/max": 0.9760658144950867, "sampling/importance_sampling_ratio/mean": 0.6828231811523438, "sampling/importance_sampling_ratio/min": 2.8533446311485022e-05, "sampling/sampling_logp_difference/max": 2.3472471237182617, "sampling/sampling_logp_difference/mean": 0.19432218372821808, "step": 835, "step_time": 13.075486190000447 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.015964673832058907, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019870923832058907, "entropy": 1.2032145224511623, "epoch": 0.00836, "grad_norm": 0.22288592159748077, "kl": 0.855853846296668, "learning_rate": 9.999925965255044e-06, "loss": -0.0173, "step": 836, "step_time": 6.616078915005346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0018939394503831863, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0018939394503831863, "completions/clipped_ratio": 0.03125, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 418.375, "completions/mean_terminated_length": 396.8709411621094, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.6629215478897095, "epoch": 0.00837, "frac_reward_zero_std": 0.25, "grad_norm": 0.15463200211524963, "kl": 0.8737289756536484, "learning_rate": 9.999925780053113e-06, "loss": -0.0118, "num_tokens": 19435998.0, "reward": 0.00202256441116333, "reward_std": 1.5259088277816772, "rewards/rollout_reward_func/mean": 0.00202256441116333, "rewards/rollout_reward_func/std": 3.3015944957733154, "sampling/importance_sampling_ratio/max": 1.0124064683914185, "sampling/importance_sampling_ratio/mean": 0.5237525701522827, "sampling/importance_sampling_ratio/min": 4.5159966248320416e-05, "sampling/sampling_logp_difference/max": 2.5316734313964844, "sampling/sampling_logp_difference/mean": 0.22647860646247864, "step": 837, "step_time": 11.308426247998796 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 1.6467535495758057, "epoch": 0.00838, "grad_norm": 0.16068631410598755, "kl": 0.8920545224100351, "learning_rate": 9.999925594619828e-06, "loss": -0.0119, "step": 838, "step_time": 5.853660175003824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 213.625, "completions/mean_terminated_length": 213.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.3830721601843834, "epoch": 0.00839, "frac_reward_zero_std": 0.5, "grad_norm": 0.2728428542613983, "kl": 0.33194553665816784, "learning_rate": 9.999925408955188e-06, "loss": -0.0107, "num_tokens": 19476861.0, "reward": 3.980518102645874, "reward_std": 0.8100151419639587, "rewards/rollout_reward_func/mean": 3.980518102645874, "rewards/rollout_reward_func/std": 1.551042079925537, "sampling/importance_sampling_ratio/max": 1.3006104230880737, "sampling/importance_sampling_ratio/mean": 0.9240680932998657, "sampling/importance_sampling_ratio/min": 0.19288617372512817, "sampling/sampling_logp_difference/max": 1.3152766227722168, "sampling/sampling_logp_difference/mean": 0.04014044255018234, "step": 839, "step_time": 10.096617529996365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.3837707694619894, "epoch": 0.0084, "grad_norm": 0.2893379032611847, "kl": 0.3263160502538085, "learning_rate": 9.999925223059192e-06, "loss": -0.0114, "step": 840, "step_time": 6.67860448600004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1622.0, "completions/max_terminated_length": 1622.0, "completions/mean_length": 196.25, "completions/mean_terminated_length": 196.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2935253009200096, "epoch": 0.00841, "frac_reward_zero_std": 0.25, "grad_norm": 0.24984687566757202, "kl": 0.9746513031423092, "learning_rate": 9.999925036931843e-06, "loss": -0.0113, "num_tokens": 19517758.0, "reward": 1.4411475658416748, "reward_std": 2.288214683532715, "rewards/rollout_reward_func/mean": 1.4411475658416748, "rewards/rollout_reward_func/std": 3.17154598236084, "sampling/importance_sampling_ratio/max": 0.9744712710380554, "sampling/importance_sampling_ratio/mean": 0.7092013359069824, "sampling/importance_sampling_ratio/min": 0.0003587428655009717, "sampling/sampling_logp_difference/max": 2.499083995819092, "sampling/sampling_logp_difference/mean": 0.18424759805202484, "step": 841, "step_time": 11.868079499001396 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.2420795261859894, "epoch": 0.00842, "grad_norm": 0.2221299558877945, "kl": 0.9430371131747961, "learning_rate": 9.999924850573138e-06, "loss": -0.012, "step": 842, "step_time": 6.7942896650019975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completions/clipped_ratio": 0.0, "completions/max_length": 1526.0, "completions/max_terminated_length": 1526.0, "completions/mean_length": 369.84375, "completions/mean_terminated_length": 369.84375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6679214388132095, "epoch": 0.00843, "frac_reward_zero_std": 0.25, "grad_norm": 0.17021681368350983, "kl": 0.7964872997254133, "learning_rate": 9.999924663983079e-06, "loss": -0.0564, "num_tokens": 19563326.0, "reward": 2.3204774856567383, "reward_std": 2.318910598754883, "rewards/rollout_reward_func/mean": 2.3204774856567383, "rewards/rollout_reward_func/std": 3.2252705097198486, "sampling/importance_sampling_ratio/max": 0.9762485027313232, "sampling/importance_sampling_ratio/mean": 0.585067868232727, "sampling/importance_sampling_ratio/min": 0.002738070674240589, "sampling/sampling_logp_difference/max": 1.9855422973632812, "sampling/sampling_logp_difference/mean": 0.2359411120414734, "step": 843, "step_time": 11.940626607998638 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010233918204903603, "entropy": 1.6095114164054394, "epoch": 0.00844, "grad_norm": 0.20288600027561188, "kl": 0.7637172834947705, "learning_rate": 9.999924477161667e-06, "loss": -0.0568, "step": 844, "step_time": 7.663082324002971 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 272.15625, "completions/mean_terminated_length": 272.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8863435164093971, "epoch": 0.00845, "frac_reward_zero_std": 0.25, "grad_norm": 0.1869278997182846, "kl": 0.71300726570189, "learning_rate": 9.999924290108896e-06, "loss": -0.0343, "num_tokens": 19608259.0, "reward": 1.838944911956787, "reward_std": 2.17614483833313, "rewards/rollout_reward_func/mean": 1.838944911956787, "rewards/rollout_reward_func/std": 3.4680559635162354, "sampling/importance_sampling_ratio/max": 0.9758765697479248, "sampling/importance_sampling_ratio/mean": 0.7792065143585205, "sampling/importance_sampling_ratio/min": 0.010710548609495163, "sampling/sampling_logp_difference/max": 1.6683954000473022, "sampling/sampling_logp_difference/mean": 0.10457267612218857, "step": 845, "step_time": 9.873097215995585 }, { "clip_ratio/high_max": 0.020312500186264515, "clip_ratio/high_mean": 0.010156250093132257, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010156250093132257, "entropy": 0.855777807533741, "epoch": 0.00846, "grad_norm": 0.13359300792217255, "kl": 0.7138085477054119, "learning_rate": 9.999924102824775e-06, "loss": -0.0345, "step": 846, "step_time": 5.361507627001629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 978.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 293.6875, "completions/mean_terminated_length": 293.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.295758668333292, "epoch": 0.00847, "frac_reward_zero_std": 0.25, "grad_norm": 0.2644861936569214, "kl": 1.0582328587770462, "learning_rate": 9.999923915309297e-06, "loss": -0.0553, "num_tokens": 19653179.0, "reward": 1.2901932001113892, "reward_std": 2.4247283935546875, "rewards/rollout_reward_func/mean": 1.2901932001113892, "rewards/rollout_reward_func/std": 3.618925094604492, "sampling/importance_sampling_ratio/max": 0.976370096206665, "sampling/importance_sampling_ratio/mean": 0.7029586434364319, "sampling/importance_sampling_ratio/min": 1.679690285527613e-05, "sampling/sampling_logp_difference/max": 9.20557689666748, "sampling/sampling_logp_difference/mean": 0.22138386964797974, "step": 847, "step_time": 9.850258398002552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2834103517234325, "epoch": 0.00848, "grad_norm": 0.20868952572345734, "kl": 1.0474590435624123, "learning_rate": 9.999923727562464e-06, "loss": -0.0561, "step": 848, "step_time": 6.260158039001908 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 262.375, "completions/mean_terminated_length": 262.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0289918556809425, "epoch": 0.00849, "frac_reward_zero_std": 0.0, "grad_norm": 0.26336756348609924, "kl": 0.8229996412992477, "learning_rate": 9.999923539584277e-06, "loss": -0.0453, "num_tokens": 19698120.0, "reward": 1.4607117176055908, "reward_std": 3.096541404724121, "rewards/rollout_reward_func/mean": 1.4607117176055908, "rewards/rollout_reward_func/std": 3.248777151107788, "sampling/importance_sampling_ratio/max": 0.9869439601898193, "sampling/importance_sampling_ratio/mean": 0.7155832052230835, "sampling/importance_sampling_ratio/min": 5.33367767729942e-07, "sampling/sampling_logp_difference/max": 9.180076599121094, "sampling/sampling_logp_difference/mean": 0.28234967589378357, "step": 849, "step_time": 10.471180844999253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0134524926543236, "epoch": 0.0085, "grad_norm": 0.2616085708141327, "kl": 0.8204996474087238, "learning_rate": 9.999923351374734e-06, "loss": -0.0459, "step": 850, "step_time": 5.855936493997433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1554.0, "completions/max_terminated_length": 1554.0, "completions/mean_length": 382.6875, "completions/mean_terminated_length": 382.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9931157100945711, "epoch": 0.00851, "frac_reward_zero_std": 0.0, "grad_norm": 0.1759028136730194, "kl": 0.7742234244942665, "learning_rate": 9.999923162933838e-06, "loss": -0.0462, "num_tokens": 19748429.0, "reward": 0.5588206052780151, "reward_std": 1.7402288913726807, "rewards/rollout_reward_func/mean": 0.5588206052780151, "rewards/rollout_reward_func/std": 3.685292959213257, "sampling/importance_sampling_ratio/max": 1.0616997480392456, "sampling/importance_sampling_ratio/mean": 0.7623704671859741, "sampling/importance_sampling_ratio/min": 2.4865532395779155e-05, "sampling/sampling_logp_difference/max": 2.725315809249878, "sampling/sampling_logp_difference/mean": 0.14796200394630432, "step": 851, "step_time": 11.971943902997737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9671148490160704, "epoch": 0.00852, "grad_norm": 0.1999349147081375, "kl": 0.7763453777879477, "learning_rate": 9.999922974261586e-06, "loss": -0.0467, "step": 852, "step_time": 7.6423091200031195 }, { "clip_ratio/high_max": 0.017769608180969954, "clip_ratio/high_mean": 0.008884804090484977, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008884804090484977, "completions/clipped_ratio": 0.0, "completions/max_length": 2075.0, "completions/max_terminated_length": 2075.0, "completions/mean_length": 767.5, "completions/mean_terminated_length": 767.5, "completions/min_length": 7.0, "completions/min_terminated_length": 7.0, "entropy": 1.7088505066931248, "epoch": 0.00853, "frac_reward_zero_std": 0.0, "grad_norm": 0.13763274252414703, "kl": 1.1324502676725388, "learning_rate": 9.999922785357982e-06, "loss": -0.0506, "num_tokens": 19811790.0, "reward": 1.6779522895812988, "reward_std": 3.057373285293579, "rewards/rollout_reward_func/mean": 1.6779522895812988, "rewards/rollout_reward_func/std": 3.2685282230377197, "sampling/importance_sampling_ratio/max": 1.0615938901901245, "sampling/importance_sampling_ratio/mean": 0.5225033760070801, "sampling/importance_sampling_ratio/min": 2.2509865065661883e-12, "sampling/sampling_logp_difference/max": 10.306619644165039, "sampling/sampling_logp_difference/mean": 0.40102535486221313, "step": 853, "step_time": 13.83476032100043 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.7042065113782883, "epoch": 0.00854, "grad_norm": 0.12837789952754974, "kl": 1.1437124088406563, "learning_rate": 9.999922596223021e-06, "loss": -0.051, "step": 854, "step_time": 7.902411219998612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1165.0, "completions/max_terminated_length": 1165.0, "completions/mean_length": 450.96875, "completions/mean_terminated_length": 450.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6872720569372177, "epoch": 0.00855, "frac_reward_zero_std": 0.25, "grad_norm": 0.2522338032722473, "kl": 0.47877639532089233, "learning_rate": 9.999922406856707e-06, "loss": -0.0167, "num_tokens": 19862642.0, "reward": 3.4549546241760254, "reward_std": 1.6175191402435303, "rewards/rollout_reward_func/mean": 3.4549546241760254, "rewards/rollout_reward_func/std": 2.33186674118042, "sampling/importance_sampling_ratio/max": 1.0230448246002197, "sampling/importance_sampling_ratio/mean": 0.7901424169540405, "sampling/importance_sampling_ratio/min": 0.010806052014231682, "sampling/sampling_logp_difference/max": 2.6410317420959473, "sampling/sampling_logp_difference/mean": 0.07919738441705704, "step": 855, "step_time": 10.962223484997594 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009588068351149559, "entropy": 0.6663993000984192, "epoch": 0.00856, "grad_norm": 0.21626004576683044, "kl": 0.4763656910508871, "learning_rate": 9.999922217259037e-06, "loss": -0.0173, "step": 856, "step_time": 6.486571441000706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 337.0625, "completions/mean_terminated_length": 337.0625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.5405413769185543, "epoch": 0.00857, "frac_reward_zero_std": 0.0, "grad_norm": 0.26215121150016785, "kl": 0.856518816202879, "learning_rate": 9.999922027430014e-06, "loss": -0.0181, "num_tokens": 19911115.0, "reward": 2.156710624694824, "reward_std": 2.2734181880950928, "rewards/rollout_reward_func/mean": 2.156710624694824, "rewards/rollout_reward_func/std": 3.5449726581573486, "sampling/importance_sampling_ratio/max": 0.9816198348999023, "sampling/importance_sampling_ratio/mean": 0.8677124977111816, "sampling/importance_sampling_ratio/min": 0.28228139877319336, "sampling/sampling_logp_difference/max": 0.6853672862052917, "sampling/sampling_logp_difference/mean": 0.04343707114458084, "step": 857, "step_time": 12.013285407996591 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011931818444281816, "entropy": 0.5374432392418385, "epoch": 0.00858, "grad_norm": 0.2024705857038498, "kl": 0.867335507646203, "learning_rate": 9.999921837369636e-06, "loss": -0.0189, "step": 858, "step_time": 6.5791704099992785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 728.0, "completions/max_terminated_length": 728.0, "completions/mean_length": 303.59375, "completions/mean_terminated_length": 303.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.2907459009438753, "epoch": 0.00859, "frac_reward_zero_std": 0.25, "grad_norm": 0.2993224859237671, "kl": 0.6656310744583607, "learning_rate": 9.999921647077902e-06, "loss": -0.0108, "num_tokens": 19957632.0, "reward": 3.7293665409088135, "reward_std": 2.0385873317718506, "rewards/rollout_reward_func/mean": 3.7293665409088135, "rewards/rollout_reward_func/std": 2.282806873321533, "sampling/importance_sampling_ratio/max": 1.0106040239334106, "sampling/importance_sampling_ratio/mean": 0.9247844219207764, "sampling/importance_sampling_ratio/min": 0.6141637563705444, "sampling/sampling_logp_difference/max": 0.42775213718414307, "sampling/sampling_logp_difference/mean": 0.0213470458984375, "step": 859, "step_time": 8.967861666998942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.2930980697274208, "epoch": 0.0086, "grad_norm": 0.2657155692577362, "kl": 0.6673802696168423, "learning_rate": 9.999921456554816e-06, "loss": -0.0111, "step": 860, "step_time": 4.913150525999299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013020833488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "completions/clipped_ratio": 0.0, "completions/max_length": 1109.0, "completions/max_terminated_length": 1109.0, "completions/mean_length": 453.5, "completions/mean_terminated_length": 453.5, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.165782667696476, "epoch": 0.00861, "frac_reward_zero_std": 0.0, "grad_norm": 0.17016467452049255, "kl": 0.9220278635621071, "learning_rate": 9.999921265800373e-06, "loss": -0.0297, "num_tokens": 20010851.0, "reward": 1.4482331275939941, "reward_std": 2.628201961517334, "rewards/rollout_reward_func/mean": 1.4482331275939941, "rewards/rollout_reward_func/std": 3.5238850116729736, "sampling/importance_sampling_ratio/max": 0.9760260581970215, "sampling/importance_sampling_ratio/mean": 0.6673387289047241, "sampling/importance_sampling_ratio/min": 0.045266345143318176, "sampling/sampling_logp_difference/max": 1.9781405925750732, "sampling/sampling_logp_difference/mean": 0.13239264488220215, "step": 861, "step_time": 11.475887171000068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010016025975346565, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "entropy": 1.1752088591456413, "epoch": 0.00862, "grad_norm": 0.17617376148700714, "kl": 0.9194905795156956, "learning_rate": 9.999921074814578e-06, "loss": -0.0302, "step": 862, "step_time": 5.543352729999242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 328.5625, "completions/mean_terminated_length": 328.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.47417575865983963, "epoch": 0.00863, "frac_reward_zero_std": 0.5, "grad_norm": 0.27498090267181396, "kl": 1.047588750720024, "learning_rate": 9.999920883597427e-06, "loss": -0.0182, "num_tokens": 20058296.0, "reward": 3.954359531402588, "reward_std": 1.4539613723754883, "rewards/rollout_reward_func/mean": 3.954359531402588, "rewards/rollout_reward_func/std": 2.071218729019165, "sampling/importance_sampling_ratio/max": 1.0906509160995483, "sampling/importance_sampling_ratio/mean": 0.8543949723243713, "sampling/importance_sampling_ratio/min": 0.00650630472227931, "sampling/sampling_logp_difference/max": 1.1912287473678589, "sampling/sampling_logp_difference/mean": 0.06227649375796318, "step": 863, "step_time": 9.603356900001017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.4789976514875889, "epoch": 0.00864, "grad_norm": 0.2743454575538635, "kl": 1.013685680925846, "learning_rate": 9.999920692148921e-06, "loss": -0.0187, "step": 864, "step_time": 5.097819902997799 }, { "clip_ratio/high_max": 0.01315789483487606, "clip_ratio/high_mean": 0.00657894741743803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00657894741743803, "completions/clipped_ratio": 0.0, "completions/max_length": 1198.0, "completions/max_terminated_length": 1198.0, "completions/mean_length": 457.65625, "completions/mean_terminated_length": 457.65625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1817671433091164, "epoch": 0.00865, "frac_reward_zero_std": 0.25, "grad_norm": 0.22734063863754272, "kl": 0.7413392327725887, "learning_rate": 9.999920500469061e-06, "loss": -0.0341, "num_tokens": 20109041.0, "reward": 1.2425944805145264, "reward_std": 1.864460825920105, "rewards/rollout_reward_func/mean": 1.2425944805145264, "rewards/rollout_reward_func/std": 3.2562808990478516, "sampling/importance_sampling_ratio/max": 0.9774843454360962, "sampling/importance_sampling_ratio/mean": 0.614332377910614, "sampling/importance_sampling_ratio/min": 1.2650929405143567e-15, "sampling/sampling_logp_difference/max": 19.815095901489258, "sampling/sampling_logp_difference/mean": 0.3085646331310272, "step": 865, "step_time": 11.277815465000458 }, { "clip_ratio/high_max": 0.03300467738881707, "clip_ratio/high_mean": 0.016502338694408536, "clip_ratio/low_mean": 0.018750000279396772, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03525233897380531, "entropy": 1.1815869435667992, "epoch": 0.00866, "grad_norm": 0.1478610783815384, "kl": 0.715192062780261, "learning_rate": 9.999920308557847e-06, "loss": -0.035, "step": 866, "step_time": 5.759414663001735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 189.34375, "completions/mean_terminated_length": 189.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.30360522121191025, "epoch": 0.00867, "frac_reward_zero_std": 0.5, "grad_norm": 0.21028472483158112, "kl": 0.46588921174407005, "learning_rate": 9.999920116415279e-06, "loss": -0.0034, "num_tokens": 20150839.0, "reward": 4.196728706359863, "reward_std": 0.6034823060035706, "rewards/rollout_reward_func/mean": 4.196728706359863, "rewards/rollout_reward_func/std": 1.269562005996704, "sampling/importance_sampling_ratio/max": 0.9764026403427124, "sampling/importance_sampling_ratio/mean": 0.9167687892913818, "sampling/importance_sampling_ratio/min": 0.5381535291671753, "sampling/sampling_logp_difference/max": 0.5955599546432495, "sampling/sampling_logp_difference/mean": 0.02259233593940735, "step": 867, "step_time": 9.57162493499709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.30768939666450024, "epoch": 0.00868, "grad_norm": 0.19521719217300415, "kl": 0.4656380284577608, "learning_rate": 9.999919924041355e-06, "loss": -0.0035, "step": 868, "step_time": 5.193839401996229 }, { "clip_ratio/high_max": 0.013602941296994686, "clip_ratio/high_mean": 0.006801470648497343, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006801470648497343, "completions/clipped_ratio": 0.0, "completions/max_length": 1573.0, "completions/max_terminated_length": 1573.0, "completions/mean_length": 480.8125, "completions/mean_terminated_length": 480.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2283969670534134, "epoch": 0.00869, "frac_reward_zero_std": 0.25, "grad_norm": 0.26671305298805237, "kl": 0.8439705222845078, "learning_rate": 9.999919731436078e-06, "loss": -0.0497, "num_tokens": 20201657.0, "reward": 1.3862369060516357, "reward_std": 2.1218926906585693, "rewards/rollout_reward_func/mean": 1.3862369060516357, "rewards/rollout_reward_func/std": 3.396171808242798, "sampling/importance_sampling_ratio/max": 0.9946643114089966, "sampling/importance_sampling_ratio/mean": 0.6241289377212524, "sampling/importance_sampling_ratio/min": 0.00015503758913837373, "sampling/sampling_logp_difference/max": 1.660935878753662, "sampling/sampling_logp_difference/mean": 0.1418529748916626, "step": 869, "step_time": 12.825337377000324 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "entropy": 1.2221156731247902, "epoch": 0.0087, "grad_norm": 0.2675642669200897, "kl": 0.8498863652348518, "learning_rate": 9.999919538599448e-06, "loss": -0.0492, "step": 870, "step_time": 6.949954781000997 }, { "clip_ratio/high_max": 0.016741071827709675, "clip_ratio/high_mean": 0.008370535913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008370535913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 498.375, "completions/mean_terminated_length": 498.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3078232146799564, "epoch": 0.00871, "frac_reward_zero_std": 0.25, "grad_norm": 0.1365618258714676, "kl": 0.9965025186538696, "learning_rate": 9.99991934553146e-06, "loss": -0.0424, "num_tokens": 20254016.0, "reward": 2.297941207885742, "reward_std": 2.4802255630493164, "rewards/rollout_reward_func/mean": 2.297941207885742, "rewards/rollout_reward_func/std": 2.934701919555664, "sampling/importance_sampling_ratio/max": 1.008051872253418, "sampling/importance_sampling_ratio/mean": 0.6044878959655762, "sampling/importance_sampling_ratio/min": 1.58086243118305e-06, "sampling/sampling_logp_difference/max": 8.732097625732422, "sampling/sampling_logp_difference/mean": 0.24031123518943787, "step": 871, "step_time": 10.83013034999749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2944601606577635, "epoch": 0.00872, "grad_norm": 0.16566753387451172, "kl": 0.9921777117997408, "learning_rate": 9.99991915223212e-06, "loss": -0.0427, "step": 872, "step_time": 5.780215770000723 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1238.0, "completions/max_terminated_length": 1238.0, "completions/mean_length": 414.5, "completions/mean_terminated_length": 414.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8045420125126839, "epoch": 0.00873, "frac_reward_zero_std": 0.0, "grad_norm": 0.3623804748058319, "kl": 0.9479364678263664, "learning_rate": 9.999918958701426e-06, "loss": -0.0308, "num_tokens": 20303570.0, "reward": 0.05756431818008423, "reward_std": 1.7854658365249634, "rewards/rollout_reward_func/mean": 0.05756431818008423, "rewards/rollout_reward_func/std": 3.3278486728668213, "sampling/importance_sampling_ratio/max": 1.802535891532898, "sampling/importance_sampling_ratio/mean": 0.645935595035553, "sampling/importance_sampling_ratio/min": 0.0016630036989226937, "sampling/sampling_logp_difference/max": 3.2967369556427, "sampling/sampling_logp_difference/mean": 0.28764817118644714, "step": 873, "step_time": 10.725897668999096 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009114583488553762, "entropy": 1.8166773468255997, "epoch": 0.00874, "grad_norm": 0.37513265013694763, "kl": 0.9490221850574017, "learning_rate": 9.999918764939378e-06, "loss": -0.0315, "step": 874, "step_time": 6.789216217999638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1170.0, "completions/max_terminated_length": 1170.0, "completions/mean_length": 512.40625, "completions/mean_terminated_length": 512.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0944659765809774, "epoch": 0.00875, "frac_reward_zero_std": 0.0, "grad_norm": 0.29445958137512207, "kl": 1.1358933560550213, "learning_rate": 9.999918570945971e-06, "loss": -0.0266, "num_tokens": 20358621.0, "reward": 2.183967113494873, "reward_std": 2.657107353210449, "rewards/rollout_reward_func/mean": 2.183967113494873, "rewards/rollout_reward_func/std": 3.5810697078704834, "sampling/importance_sampling_ratio/max": 0.9681951403617859, "sampling/importance_sampling_ratio/mean": 0.6607741713523865, "sampling/importance_sampling_ratio/min": 0.008121808990836143, "sampling/sampling_logp_difference/max": 2.3495020866394043, "sampling/sampling_logp_difference/mean": 0.15993410348892212, "step": 875, "step_time": 10.81412329000159 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 1.1029228568077087, "epoch": 0.00876, "grad_norm": 0.2772195041179657, "kl": 1.1357549130916595, "learning_rate": 9.999918376721215e-06, "loss": -0.0273, "step": 876, "step_time": 5.835054599996511 }, { "clip_ratio/high_max": 0.01875000074505806, "clip_ratio/high_mean": 0.00937500037252903, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00937500037252903, "completions/clipped_ratio": 0.0, "completions/max_length": 1734.0, "completions/max_terminated_length": 1734.0, "completions/mean_length": 469.21875, "completions/mean_terminated_length": 469.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6967902295291424, "epoch": 0.00877, "frac_reward_zero_std": 0.25, "grad_norm": 0.46078017354011536, "kl": 0.7463375125080347, "learning_rate": 9.999918182265103e-06, "loss": -0.0432, "num_tokens": 20410898.0, "reward": 3.0649428367614746, "reward_std": 2.3301610946655273, "rewards/rollout_reward_func/mean": 3.0649428367614746, "rewards/rollout_reward_func/std": 2.854250431060791, "sampling/importance_sampling_ratio/max": 1.0848283767700195, "sampling/importance_sampling_ratio/mean": 0.7470453977584839, "sampling/importance_sampling_ratio/min": 0.03524044528603554, "sampling/sampling_logp_difference/max": 2.5489091873168945, "sampling/sampling_logp_difference/mean": 0.0952567607164383, "step": 877, "step_time": 12.344379961003142 }, { "clip_ratio/high_max": 0.01875000074505806, "clip_ratio/high_mean": 0.00937500037252903, "clip_ratio/low_mean": 0.008713942486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018088942859321833, "entropy": 0.7015666477382183, "epoch": 0.00878, "grad_norm": 0.19112743437290192, "kl": 0.7671960517764091, "learning_rate": 9.999917987577637e-06, "loss": -0.0433, "step": 878, "step_time": 7.648091569997632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 774.0, "completions/max_terminated_length": 774.0, "completions/mean_length": 114.3125, "completions/mean_terminated_length": 114.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5827828366309404, "epoch": 0.00879, "frac_reward_zero_std": 0.5, "grad_norm": 0.19562838971614838, "kl": 0.3282871302217245, "learning_rate": 9.999917792658816e-06, "loss": 0.0092, "num_tokens": 20448885.0, "reward": 3.801835536956787, "reward_std": 0.5550940632820129, "rewards/rollout_reward_func/mean": 3.801835536956787, "rewards/rollout_reward_func/std": 1.1715195178985596, "sampling/importance_sampling_ratio/max": 0.9848823547363281, "sampling/importance_sampling_ratio/mean": 0.8525406122207642, "sampling/importance_sampling_ratio/min": 0.3471878170967102, "sampling/sampling_logp_difference/max": 0.9281833171844482, "sampling/sampling_logp_difference/mean": 0.054137952625751495, "step": 879, "step_time": 8.363840031997825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6256113033741713, "epoch": 0.0088, "grad_norm": 0.20129947364330292, "kl": 0.33231525775045156, "learning_rate": 9.999917597508642e-06, "loss": 0.009, "step": 880, "step_time": 4.589580460000434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1780.0, "completions/max_terminated_length": 1780.0, "completions/mean_length": 611.5, "completions/mean_terminated_length": 611.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6088215336203575, "epoch": 0.00881, "frac_reward_zero_std": 0.0, "grad_norm": 0.2687864899635315, "kl": 1.1410496979951859, "learning_rate": 9.999917402127112e-06, "loss": 0.0077, "num_tokens": 20506298.0, "reward": 1.0189056396484375, "reward_std": 2.623293399810791, "rewards/rollout_reward_func/mean": 1.0189056396484375, "rewards/rollout_reward_func/std": 3.7478959560394287, "sampling/importance_sampling_ratio/max": 1.0651272535324097, "sampling/importance_sampling_ratio/mean": 0.5623109340667725, "sampling/importance_sampling_ratio/min": 5.040488026963885e-10, "sampling/sampling_logp_difference/max": 9.154963493347168, "sampling/sampling_logp_difference/mean": 0.3179911673069, "step": 881, "step_time": 12.988383127001725 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.6047537215054035, "epoch": 0.00882, "grad_norm": 0.3138366639614105, "kl": 1.143685720860958, "learning_rate": 9.99991720651423e-06, "loss": 0.0077, "step": 882, "step_time": 8.004821032000109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.03125, "completions/max_length": 720.0, "completions/max_terminated_length": 720.0, "completions/mean_length": 268.09375, "completions/mean_terminated_length": 255.09677124023438, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.881147313863039, "epoch": 0.00883, "frac_reward_zero_std": 0.25, "grad_norm": 0.18893706798553467, "kl": 0.9800506681203842, "learning_rate": 9.99991701066999e-06, "loss": 0.0182, "num_tokens": 20550754.0, "reward": 2.2110018730163574, "reward_std": 1.2733592987060547, "rewards/rollout_reward_func/mean": 2.2110018730163574, "rewards/rollout_reward_func/std": 2.7418394088745117, "sampling/importance_sampling_ratio/max": 0.9764094948768616, "sampling/importance_sampling_ratio/mean": 0.761664628982544, "sampling/importance_sampling_ratio/min": 0.006448755972087383, "sampling/sampling_logp_difference/max": 2.210756301879883, "sampling/sampling_logp_difference/mean": 0.12601706385612488, "step": 883, "step_time": 8.855025248001766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.868976004421711, "epoch": 0.00884, "grad_norm": 0.17550744116306305, "kl": 0.9783035106956959, "learning_rate": 9.999916814594399e-06, "loss": 0.0182, "step": 884, "step_time": 4.76646129600158 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "completions/clipped_ratio": 0.0, "completions/max_length": 1169.0, "completions/max_terminated_length": 1169.0, "completions/mean_length": 355.21875, "completions/mean_terminated_length": 355.21875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9934021309018135, "epoch": 0.00885, "frac_reward_zero_std": 0.25, "grad_norm": 0.22299110889434814, "kl": 0.8998375125229359, "learning_rate": 9.999916618287452e-06, "loss": -0.0331, "num_tokens": 20597966.0, "reward": 2.535470724105835, "reward_std": 1.5112271308898926, "rewards/rollout_reward_func/mean": 2.535470724105835, "rewards/rollout_reward_func/std": 3.293426275253296, "sampling/importance_sampling_ratio/max": 1.0021673440933228, "sampling/importance_sampling_ratio/mean": 0.7378302812576294, "sampling/importance_sampling_ratio/min": 0.044792864471673965, "sampling/sampling_logp_difference/max": 1.9591089487075806, "sampling/sampling_logp_difference/mean": 0.10668325424194336, "step": 885, "step_time": 10.135288448997017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 0.9893896132707596, "epoch": 0.00886, "grad_norm": 0.16932794451713562, "kl": 0.8998971246182919, "learning_rate": 9.999916421749152e-06, "loss": -0.0332, "step": 886, "step_time": 6.0792807130019355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 505.96875, "completions/mean_terminated_length": 505.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3670655265450478, "epoch": 0.00887, "frac_reward_zero_std": 0.0, "grad_norm": 0.5474529266357422, "kl": 1.1596504300832748, "learning_rate": 9.999916224979497e-06, "loss": -0.0052, "num_tokens": 20651241.0, "reward": 2.4083101749420166, "reward_std": 2.6881661415100098, "rewards/rollout_reward_func/mean": 2.4083101749420166, "rewards/rollout_reward_func/std": 3.322561502456665, "sampling/importance_sampling_ratio/max": 0.9727878570556641, "sampling/importance_sampling_ratio/mean": 0.5918137431144714, "sampling/importance_sampling_ratio/min": 0.018887823447585106, "sampling/sampling_logp_difference/max": 2.341184377670288, "sampling/sampling_logp_difference/mean": 0.15744230151176453, "step": 887, "step_time": 12.485260780998942 }, { "clip_ratio/high_max": 0.07907197065651417, "clip_ratio/high_mean": 0.039535985328257084, "clip_ratio/low_mean": 0.03125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.07078598532825708, "entropy": 1.3456137031316757, "epoch": 0.00888, "grad_norm": 0.2202635556459427, "kl": 1.1262270733714104, "learning_rate": 9.999916027978489e-06, "loss": -0.0077, "step": 888, "step_time": 6.757766977996653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 3.0, "completions/max_terminated_length": 3.0, "completions/mean_length": 3.0, "completions/mean_terminated_length": 3.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.21114527992904186, "epoch": 0.00889, "frac_reward_zero_std": 1.0, "grad_norm": 0.00045823221444152296, "kl": 0.05451732035726309, "learning_rate": 9.999915830746127e-06, "loss": 0.0001, "num_tokens": 20680991.0, "reward": 3.9680702686309814, "reward_std": 0.0, "rewards/rollout_reward_func/mean": 3.9680702686309814, "rewards/rollout_reward_func/std": 0.04986308887600899, "sampling/importance_sampling_ratio/max": 0.9776897430419922, "sampling/importance_sampling_ratio/mean": 0.9597665667533875, "sampling/importance_sampling_ratio/min": 0.726131796836853, "sampling/sampling_logp_difference/max": 0.23355057835578918, "sampling/sampling_logp_difference/mean": 0.01420600712299347, "step": 889, "step_time": 5.149968652001917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.1983584128320217, "epoch": 0.0089, "grad_norm": 0.000346803804859519, "kl": 0.05420104321092367, "learning_rate": 9.99991563328241e-06, "loss": 0.0001, "step": 890, "step_time": 2.7974350610002148 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "completions/clipped_ratio": 0.0, "completions/max_length": 510.0, "completions/max_terminated_length": 510.0, "completions/mean_length": 199.0, "completions/mean_terminated_length": 199.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9623089656233788, "epoch": 0.00891, "frac_reward_zero_std": 0.5, "grad_norm": 0.09635137766599655, "kl": 0.37319544749334455, "learning_rate": 9.999915435587339e-06, "loss": -0.0098, "num_tokens": 20722486.0, "reward": 2.69134783744812, "reward_std": 0.4975012540817261, "rewards/rollout_reward_func/mean": 2.69134783744812, "rewards/rollout_reward_func/std": 2.781386613845825, "sampling/importance_sampling_ratio/max": 0.9770484566688538, "sampling/importance_sampling_ratio/mean": 0.7800455093383789, "sampling/importance_sampling_ratio/min": 5.227045611462208e-16, "sampling/sampling_logp_difference/max": 21.02372169494629, "sampling/sampling_logp_difference/mean": 0.2690100371837616, "step": 891, "step_time": 8.936077013000613 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 0.9623199179768562, "epoch": 0.00892, "grad_norm": 0.06798476725816727, "kl": 0.35744067281484604, "learning_rate": 9.999915237660913e-06, "loss": -0.0098, "step": 892, "step_time": 4.296705605995157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 492.0, "completions/max_terminated_length": 492.0, "completions/mean_length": 113.75, "completions/mean_terminated_length": 113.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.45942433178424835, "epoch": 0.00893, "frac_reward_zero_std": 0.5, "grad_norm": 0.23617051541805267, "kl": 0.41039042454212904, "learning_rate": 9.999915039503135e-06, "loss": -0.0071, "num_tokens": 20761603.0, "reward": 2.449173927307129, "reward_std": 0.018210088834166527, "rewards/rollout_reward_func/mean": 2.449173927307129, "rewards/rollout_reward_func/std": 3.0839529037475586, "sampling/importance_sampling_ratio/max": 1.3076921701431274, "sampling/importance_sampling_ratio/mean": 0.9130517244338989, "sampling/importance_sampling_ratio/min": 0.18583615124225616, "sampling/sampling_logp_difference/max": 1.382158875465393, "sampling/sampling_logp_difference/mean": 0.05516911670565605, "step": 893, "step_time": 7.606734574001166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.45923235826194286, "epoch": 0.00894, "grad_norm": 0.2254272699356079, "kl": 0.4165171952918172, "learning_rate": 9.999914841114002e-06, "loss": -0.0074, "step": 894, "step_time": 4.133801619998849 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "completions/clipped_ratio": 0.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 590.875, "completions/mean_terminated_length": 590.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9838472828269005, "epoch": 0.00895, "frac_reward_zero_std": 0.0, "grad_norm": 0.41109317541122437, "kl": 1.0208346880972385, "learning_rate": 9.999914642493514e-06, "loss": -0.0347, "num_tokens": 20818845.0, "reward": 2.1912200450897217, "reward_std": 3.341952323913574, "rewards/rollout_reward_func/mean": 2.1912200450897217, "rewards/rollout_reward_func/std": 3.2583611011505127, "sampling/importance_sampling_ratio/max": 0.975858747959137, "sampling/importance_sampling_ratio/mean": 0.6924425363540649, "sampling/importance_sampling_ratio/min": 0.006868033669888973, "sampling/sampling_logp_difference/max": 3.3270387649536133, "sampling/sampling_logp_difference/mean": 0.12990061938762665, "step": 895, "step_time": 12.851779399001316 }, { "clip_ratio/high_max": 0.014756944496184587, "clip_ratio/high_mean": 0.007378472248092294, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015190972248092294, "entropy": 0.9954831823706627, "epoch": 0.00896, "grad_norm": 0.2723575532436371, "kl": 1.0299601629376411, "learning_rate": 9.999914443641672e-06, "loss": -0.0357, "step": 896, "step_time": 6.775321948000055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 852.0, "completions/max_terminated_length": 852.0, "completions/mean_length": 458.875, "completions/mean_terminated_length": 458.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8035610653460026, "epoch": 0.00897, "frac_reward_zero_std": 0.0, "grad_norm": 0.3052779734134674, "kl": 0.9039426371455193, "learning_rate": 9.999914244558477e-06, "loss": -0.0381, "num_tokens": 20872218.0, "reward": 2.2082388401031494, "reward_std": 2.3855772018432617, "rewards/rollout_reward_func/mean": 2.2082388401031494, "rewards/rollout_reward_func/std": 3.547469139099121, "sampling/importance_sampling_ratio/max": 1.0795016288757324, "sampling/importance_sampling_ratio/mean": 0.7958283424377441, "sampling/importance_sampling_ratio/min": 0.0037662209942936897, "sampling/sampling_logp_difference/max": 1.4187425374984741, "sampling/sampling_logp_difference/mean": 0.09835996478796005, "step": 897, "step_time": 9.362241949000236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666744276881, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666744276881, "entropy": 0.8269752226769924, "epoch": 0.00898, "grad_norm": 0.24402044713497162, "kl": 0.9100581258535385, "learning_rate": 9.999914045243927e-06, "loss": -0.0387, "step": 898, "step_time": 5.065920774997721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 126.3125, "completions/mean_terminated_length": 126.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5279658548533916, "epoch": 0.00899, "frac_reward_zero_std": 0.5, "grad_norm": 0.2651228606700897, "kl": 0.4233944844454527, "learning_rate": 9.999913845698023e-06, "loss": -0.0118, "num_tokens": 20909810.0, "reward": 3.109096050262451, "reward_std": 1.7162015438079834, "rewards/rollout_reward_func/mean": 3.109096050262451, "rewards/rollout_reward_func/std": 2.4936091899871826, "sampling/importance_sampling_ratio/max": 0.9774038791656494, "sampling/importance_sampling_ratio/mean": 0.8715282678604126, "sampling/importance_sampling_ratio/min": 0.2614300549030304, "sampling/sampling_logp_difference/max": 1.11661696434021, "sampling/sampling_logp_difference/mean": 0.047692909836769104, "step": 899, "step_time": 8.225808382998366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.5341678503900766, "epoch": 0.009, "grad_norm": 0.2677895128726959, "kl": 0.4211872275918722, "learning_rate": 9.999913645920764e-06, "loss": -0.0126, "step": 900, "step_time": 4.272219399999813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 141.5625, "completions/mean_terminated_length": 141.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.0597489289939404, "epoch": 0.00901, "frac_reward_zero_std": 0.0, "grad_norm": 0.6535152196884155, "kl": 0.8021267196163535, "learning_rate": 9.999913445912153e-06, "loss": -0.0632, "num_tokens": 20951127.0, "reward": -1.6111335754394531, "reward_std": 1.9350512027740479, "rewards/rollout_reward_func/mean": -1.6111335754394531, "rewards/rollout_reward_func/std": 2.6192469596862793, "sampling/importance_sampling_ratio/max": 1.0670446157455444, "sampling/importance_sampling_ratio/mean": 0.7796983122825623, "sampling/importance_sampling_ratio/min": 0.051336873322725296, "sampling/sampling_logp_difference/max": 2.234114408493042, "sampling/sampling_logp_difference/mean": 0.11724495887756348, "step": 901, "step_time": 7.547798083000089 }, { "clip_ratio/high_max": 0.04513888992369175, "clip_ratio/high_mean": 0.022569444961845875, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.047569445334374905, "entropy": 1.0817515663802624, "epoch": 0.00902, "grad_norm": 0.29911407828330994, "kl": 0.7739024804905057, "learning_rate": 9.999913245672189e-06, "loss": -0.0645, "step": 902, "step_time": 4.283779678997234 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1872.0, "completions/max_terminated_length": 1872.0, "completions/mean_length": 686.46875, "completions/mean_terminated_length": 686.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4082800522446632, "epoch": 0.00903, "frac_reward_zero_std": 0.0, "grad_norm": 0.7393279671669006, "kl": 1.2068448066711426, "learning_rate": 9.99991304520087e-06, "loss": -0.0483, "num_tokens": 21011765.0, "reward": 1.6333062648773193, "reward_std": 3.5888352394104004, "rewards/rollout_reward_func/mean": 1.6333062648773193, "rewards/rollout_reward_func/std": 3.515606164932251, "sampling/importance_sampling_ratio/max": 1.093666672706604, "sampling/importance_sampling_ratio/mean": 0.5303647518157959, "sampling/importance_sampling_ratio/min": 0.03560701385140419, "sampling/sampling_logp_difference/max": 1.4940670728683472, "sampling/sampling_logp_difference/mean": 0.16209270060062408, "step": 903, "step_time": 13.482858391002082 }, { "clip_ratio/high_max": 0.02818627515807748, "clip_ratio/high_mean": 0.01409313757903874, "clip_ratio/low_mean": 0.022773693315684795, "clip_ratio/low_min": 0.007352941203862429, "clip_ratio/region_mean": 0.03686683066189289, "entropy": 1.4372473284602165, "epoch": 0.00904, "grad_norm": 0.4308069348335266, "kl": 1.2273168861865997, "learning_rate": 9.999912844498194e-06, "loss": -0.0513, "step": 904, "step_time": 7.634646476002672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1263.0, "completions/max_terminated_length": 1263.0, "completions/mean_length": 540.09375, "completions/mean_terminated_length": 540.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5420328378677368, "epoch": 0.00905, "frac_reward_zero_std": 0.0, "grad_norm": 0.1881643682718277, "kl": 1.3311254270374775, "learning_rate": 9.999912643564166e-06, "loss": -0.0237, "num_tokens": 21068072.0, "reward": 0.7113498449325562, "reward_std": 1.8105189800262451, "rewards/rollout_reward_func/mean": 0.7113498449325562, "rewards/rollout_reward_func/std": 3.65301775932312, "sampling/importance_sampling_ratio/max": 0.9600470066070557, "sampling/importance_sampling_ratio/mean": 0.5329595804214478, "sampling/importance_sampling_ratio/min": 0.00015294285549316555, "sampling/sampling_logp_difference/max": 3.0306644439697266, "sampling/sampling_logp_difference/mean": 0.24289771914482117, "step": 905, "step_time": 10.971939439999915 }, { "clip_ratio/high_max": 0.005681818351149559, "clip_ratio/high_mean": 0.0028409091755747795, "clip_ratio/low_mean": 0.026041666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.028882576152682304, "entropy": 1.5905530080199242, "epoch": 0.00906, "grad_norm": 0.18443626165390015, "kl": 1.3619620725512505, "learning_rate": 9.999912442398785e-06, "loss": -0.0243, "step": 906, "step_time": 5.945841430000655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "completions/clipped_ratio": 0.03125, "completions/max_length": 1124.0, "completions/max_terminated_length": 1124.0, "completions/mean_length": 420.15625, "completions/mean_terminated_length": 417.9031982421875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9625057950615883, "epoch": 0.00907, "frac_reward_zero_std": 0.0, "grad_norm": 0.26200467348098755, "kl": 1.0506520494818687, "learning_rate": 9.99991224100205e-06, "loss": -0.0491, "num_tokens": 21120826.0, "reward": 3.7301628589630127, "reward_std": 2.0767486095428467, "rewards/rollout_reward_func/mean": 3.7301628589630127, "rewards/rollout_reward_func/std": 2.6668312549591064, "sampling/importance_sampling_ratio/max": 0.9491474032402039, "sampling/importance_sampling_ratio/mean": 0.7351585626602173, "sampling/importance_sampling_ratio/min": 0.00012174159201094881, "sampling/sampling_logp_difference/max": 2.622434139251709, "sampling/sampling_logp_difference/mean": 0.16939297318458557, "step": 907, "step_time": 10.18219348099774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0028409091755747795, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0028409091755747795, "entropy": 0.9695286434143782, "epoch": 0.00908, "grad_norm": 0.2380310595035553, "kl": 1.0523380972445011, "learning_rate": 9.99991203937396e-06, "loss": -0.0494, "step": 908, "step_time": 6.488187412998741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "completions/clipped_ratio": 0.0, "completions/max_length": 543.0, "completions/max_terminated_length": 543.0, "completions/mean_length": 173.75, "completions/mean_terminated_length": 173.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.103493094444275, "epoch": 0.00909, "frac_reward_zero_std": 0.5, "grad_norm": 0.22642815113067627, "kl": 1.3203633110970259, "learning_rate": 9.999911837514517e-06, "loss": -0.0082, "num_tokens": 21160158.0, "reward": 1.3846323490142822, "reward_std": 1.4265515804290771, "rewards/rollout_reward_func/mean": 1.3846323490142822, "rewards/rollout_reward_func/std": 3.1884915828704834, "sampling/importance_sampling_ratio/max": 0.9756865501403809, "sampling/importance_sampling_ratio/mean": 0.6883429884910583, "sampling/importance_sampling_ratio/min": 0.02682875283062458, "sampling/sampling_logp_difference/max": 2.7109663486480713, "sampling/sampling_logp_difference/mean": 0.1429286003112793, "step": 909, "step_time": 7.921018865001315 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01927083358168602, "entropy": 1.1512816175818443, "epoch": 0.0091, "grad_norm": 0.21117205917835236, "kl": 1.3391930274665356, "learning_rate": 9.99991163542372e-06, "loss": -0.0088, "step": 910, "step_time": 4.1506460109994805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1265.0, "completions/max_terminated_length": 1265.0, "completions/mean_length": 270.0625, "completions/mean_terminated_length": 270.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9201961420476437, "epoch": 0.00911, "frac_reward_zero_std": 0.5, "grad_norm": 0.19703784584999084, "kl": 1.0515190362930298, "learning_rate": 9.999911433101569e-06, "loss": -0.0247, "num_tokens": 21203741.0, "reward": 0.39730870723724365, "reward_std": 1.814002513885498, "rewards/rollout_reward_func/mean": 0.39730870723724365, "rewards/rollout_reward_func/std": 3.3892335891723633, "sampling/importance_sampling_ratio/max": 1.2833478450775146, "sampling/importance_sampling_ratio/mean": 0.7810798287391663, "sampling/importance_sampling_ratio/min": 0.016227591782808304, "sampling/sampling_logp_difference/max": 2.7826731204986572, "sampling/sampling_logp_difference/mean": 0.12585243582725525, "step": 911, "step_time": 10.14046935099941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.9506214000284672, "epoch": 0.00912, "grad_norm": 0.21331095695495605, "kl": 1.0503503158688545, "learning_rate": 9.999911230548063e-06, "loss": -0.0251, "step": 912, "step_time": 6.843102637998527 }, { "clip_ratio/high_max": 0.014823718462139368, "clip_ratio/high_mean": 0.007411859231069684, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007411859231069684, "completions/clipped_ratio": 0.0, "completions/max_length": 1250.0, "completions/max_terminated_length": 1250.0, "completions/mean_length": 476.65625, "completions/mean_terminated_length": 476.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7030697297304869, "epoch": 0.00913, "frac_reward_zero_std": 0.5, "grad_norm": 0.20040228962898254, "kl": 1.1850656233727932, "learning_rate": 9.999911027763205e-06, "loss": -0.0172, "num_tokens": 21255402.0, "reward": 3.167853355407715, "reward_std": 1.8093163967132568, "rewards/rollout_reward_func/mean": 3.167853355407715, "rewards/rollout_reward_func/std": 2.8817570209503174, "sampling/importance_sampling_ratio/max": 0.9736384749412537, "sampling/importance_sampling_ratio/mean": 0.577487587928772, "sampling/importance_sampling_ratio/min": 8.292124107356358e-07, "sampling/sampling_logp_difference/max": 2.977182149887085, "sampling/sampling_logp_difference/mean": 0.3060202896595001, "step": 913, "step_time": 10.76795443699666 }, { "clip_ratio/high_max": 0.02003205195069313, "clip_ratio/high_mean": 0.010016025975346565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "entropy": 1.7220117673277855, "epoch": 0.00914, "grad_norm": 0.20071186125278473, "kl": 1.1437643263489008, "learning_rate": 9.99991082474699e-06, "loss": -0.0174, "step": 914, "step_time": 5.994825313999172 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1174.0, "completions/max_terminated_length": 1174.0, "completions/mean_length": 423.875, "completions/mean_terminated_length": 423.875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4993038550019264, "epoch": 0.00915, "frac_reward_zero_std": 0.25, "grad_norm": 0.2569895088672638, "kl": 1.3319231979548931, "learning_rate": 9.999910621499425e-06, "loss": -0.0393, "num_tokens": 21304716.0, "reward": 2.1650896072387695, "reward_std": 2.388371706008911, "rewards/rollout_reward_func/mean": 2.1650896072387695, "rewards/rollout_reward_func/std": 2.9489541053771973, "sampling/importance_sampling_ratio/max": 0.9693811535835266, "sampling/importance_sampling_ratio/mean": 0.6140410900115967, "sampling/importance_sampling_ratio/min": 0.0005085545708425343, "sampling/sampling_logp_difference/max": 3.5980911254882812, "sampling/sampling_logp_difference/mean": 0.2626664936542511, "step": 915, "step_time": 10.412661974996809 }, { "clip_ratio/high_max": 0.007352941203862429, "clip_ratio/high_mean": 0.0036764706019312143, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0036764706019312143, "entropy": 1.4804516956210136, "epoch": 0.00916, "grad_norm": 0.16817735135555267, "kl": 1.2735405340790749, "learning_rate": 9.999910418020505e-06, "loss": -0.0394, "step": 916, "step_time": 6.167936928999552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002314814832061529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002314814832061529, "completions/clipped_ratio": 0.0, "completions/max_length": 1223.0, "completions/max_terminated_length": 1223.0, "completions/mean_length": 547.375, "completions/mean_terminated_length": 547.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8019081503152847, "epoch": 0.00917, "frac_reward_zero_std": 0.0, "grad_norm": 0.16650119423866272, "kl": 1.3796225562691689, "learning_rate": 9.999910214310228e-06, "loss": -0.076, "num_tokens": 21360527.0, "reward": 0.7258013486862183, "reward_std": 3.249577045440674, "rewards/rollout_reward_func/mean": 0.7258013486862183, "rewards/rollout_reward_func/std": 3.664496898651123, "sampling/importance_sampling_ratio/max": 0.9506843686103821, "sampling/importance_sampling_ratio/mean": 0.49803268909454346, "sampling/importance_sampling_ratio/min": 0.00014760559133719653, "sampling/sampling_logp_difference/max": 2.311878204345703, "sampling/sampling_logp_difference/mean": 0.24213582277297974, "step": 917, "step_time": 10.929543160998946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004629629664123058, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004629629664123058, "entropy": 1.8000476509332657, "epoch": 0.00918, "grad_norm": 0.18221738934516907, "kl": 1.3688920810818672, "learning_rate": 9.999910010368603e-06, "loss": -0.0765, "step": 918, "step_time": 5.917462581999644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 229.71875, "completions/mean_terminated_length": 229.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6927301809191704, "epoch": 0.00919, "frac_reward_zero_std": 0.5, "grad_norm": 0.12460623681545258, "kl": 1.0891753733158112, "learning_rate": 9.99990980619562e-06, "loss": -0.0086, "num_tokens": 21403001.0, "reward": 3.371082305908203, "reward_std": 1.417203664779663, "rewards/rollout_reward_func/mean": 3.371082305908203, "rewards/rollout_reward_func/std": 2.237487554550171, "sampling/importance_sampling_ratio/max": 0.9743579030036926, "sampling/importance_sampling_ratio/mean": 0.7963671088218689, "sampling/importance_sampling_ratio/min": 0.06850207597017288, "sampling/sampling_logp_difference/max": 2.5777697563171387, "sampling/sampling_logp_difference/mean": 0.0753079205751419, "step": 919, "step_time": 9.907085288999951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7064928784966469, "epoch": 0.0092, "grad_norm": 0.1264650672674179, "kl": 1.0911113498732448, "learning_rate": 9.999909601791285e-06, "loss": -0.0087, "step": 920, "step_time": 5.8811288839988265 }, { "clip_ratio/high_max": 0.019852941390126944, "clip_ratio/high_mean": 0.009926470695063472, "clip_ratio/low_mean": 0.0036764706019312143, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013602941296994686, "completions/clipped_ratio": 0.0, "completions/max_length": 1488.0, "completions/max_terminated_length": 1488.0, "completions/mean_length": 625.28125, "completions/mean_terminated_length": 625.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.0651669800281525, "epoch": 0.00921, "frac_reward_zero_std": 0.0, "grad_norm": 0.15208402276039124, "kl": 1.3343398943543434, "learning_rate": 9.999909397155596e-06, "loss": -0.0211, "num_tokens": 21460502.0, "reward": -0.07566618919372559, "reward_std": 2.614938974380493, "rewards/rollout_reward_func/mean": -0.07566618919372559, "rewards/rollout_reward_func/std": 3.3695688247680664, "sampling/importance_sampling_ratio/max": 0.9756170511245728, "sampling/importance_sampling_ratio/mean": 0.3559045195579529, "sampling/importance_sampling_ratio/min": 1.8195293410981937e-18, "sampling/sampling_logp_difference/max": 19.359617233276367, "sampling/sampling_logp_difference/mean": 0.6831107139587402, "step": 921, "step_time": 12.873934301998815 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.006517379777505994, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009493570076301694, "entropy": 2.0629129707813263, "epoch": 0.00922, "grad_norm": 0.3056183457374573, "kl": 1.3325571864843369, "learning_rate": 9.999909192288553e-06, "loss": -0.0207, "step": 922, "step_time": 6.526948578000884 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0029761905316263437, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010788690531626344, "completions/clipped_ratio": 0.0625, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 464.5, "completions/mean_terminated_length": 480.16668701171875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6525126993656158, "epoch": 0.00923, "frac_reward_zero_std": 0.25, "grad_norm": 0.06614259630441666, "kl": 0.9647031165659428, "learning_rate": 9.999908987190155e-06, "loss": -0.0403, "num_tokens": 21512398.0, "reward": -0.14048779010772705, "reward_std": 2.1822509765625, "rewards/rollout_reward_func/mean": -0.14048779010772705, "rewards/rollout_reward_func/std": 3.30487060546875, "sampling/importance_sampling_ratio/max": 0.9755692481994629, "sampling/importance_sampling_ratio/mean": 0.5106343626976013, "sampling/importance_sampling_ratio/min": 8.227273941145651e-33, "sampling/sampling_logp_difference/max": 14.448917388916016, "sampling/sampling_logp_difference/mean": 0.508768618106842, "step": 923, "step_time": 10.687145349997081 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.01076007355004549, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01466632355004549, "entropy": 1.6525367572903633, "epoch": 0.00924, "grad_norm": 0.06875814497470856, "kl": 0.960365254431963, "learning_rate": 9.999908781860404e-06, "loss": -0.0404, "step": 924, "step_time": 5.818428117001531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1230.0, "completions/max_terminated_length": 1230.0, "completions/mean_length": 238.4375, "completions/mean_terminated_length": 238.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.018757175654173, "epoch": 0.00925, "frac_reward_zero_std": 0.75, "grad_norm": 0.07704579830169678, "kl": 0.7328871740028262, "learning_rate": 9.9999085762993e-06, "loss": -0.0114, "num_tokens": 21553670.0, "reward": 3.4280595779418945, "reward_std": 0.9124286770820618, "rewards/rollout_reward_func/mean": 3.4280595779418945, "rewards/rollout_reward_func/std": 1.9723618030548096, "sampling/importance_sampling_ratio/max": 0.9767794013023376, "sampling/importance_sampling_ratio/mean": 0.7437124252319336, "sampling/importance_sampling_ratio/min": 0.005770792718976736, "sampling/sampling_logp_difference/max": 2.173508882522583, "sampling/sampling_logp_difference/mean": 0.1502675563097, "step": 925, "step_time": 10.72543093099921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.0174372289329767, "epoch": 0.00926, "grad_norm": 0.07807435840368271, "kl": 0.7274854094721377, "learning_rate": 9.999908370506842e-06, "loss": -0.0114, "step": 926, "step_time": 5.561385190998408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 1115.0, "completions/max_terminated_length": 1115.0, "completions/mean_length": 227.53125, "completions/mean_terminated_length": 227.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.39547245390713215, "epoch": 0.00927, "frac_reward_zero_std": 0.5, "grad_norm": 0.1963089257478714, "kl": 0.5909473225474358, "learning_rate": 9.99990816448303e-06, "loss": -0.0311, "num_tokens": 21595745.0, "reward": 3.617781400680542, "reward_std": 1.4485862255096436, "rewards/rollout_reward_func/mean": 3.617781400680542, "rewards/rollout_reward_func/std": 2.0249249935150146, "sampling/importance_sampling_ratio/max": 0.9747621417045593, "sampling/importance_sampling_ratio/mean": 0.8827989101409912, "sampling/importance_sampling_ratio/min": 8.947325240723238e-19, "sampling/sampling_logp_difference/max": 18.339799880981445, "sampling/sampling_logp_difference/mean": 0.28449660539627075, "step": 927, "step_time": 9.708648762998564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "entropy": 0.38770069368183613, "epoch": 0.00928, "grad_norm": 0.21337668597698212, "kl": 0.59040841832757, "learning_rate": 9.999907958227866e-06, "loss": -0.031, "step": 928, "step_time": 5.549950542997976 }, { "clip_ratio/high_max": 0.03393665235489607, "clip_ratio/high_mean": 0.016968326177448034, "clip_ratio/low_mean": 0.008653846336528659, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025622172513976693, "completions/clipped_ratio": 0.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 490.28125, "completions/mean_terminated_length": 490.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3214042037725449, "epoch": 0.00929, "frac_reward_zero_std": 0.0, "grad_norm": 1.1615716218948364, "kl": 4.347385726869106, "learning_rate": 9.999907751741346e-06, "loss": -0.0224, "num_tokens": 21650596.0, "reward": 2.037332057952881, "reward_std": 2.293447732925415, "rewards/rollout_reward_func/mean": 2.037332057952881, "rewards/rollout_reward_func/std": 3.453397274017334, "sampling/importance_sampling_ratio/max": 0.9673981070518494, "sampling/importance_sampling_ratio/mean": 0.6419576406478882, "sampling/importance_sampling_ratio/min": 7.020540290607336e-13, "sampling/sampling_logp_difference/max": 13.81716251373291, "sampling/sampling_logp_difference/mean": 0.28765159845352173, "step": 929, "step_time": 11.245032332000847 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.006570513127371669, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015499084955081344, "entropy": 1.3392175883054733, "epoch": 0.0093, "grad_norm": 0.30224931240081787, "kl": 2.2426757849752903, "learning_rate": 9.999907545023473e-06, "loss": -0.0274, "step": 930, "step_time": 5.608869821002372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 935.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 310.75, "completions/mean_terminated_length": 310.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8687414862215519, "epoch": 0.00931, "frac_reward_zero_std": 0.25, "grad_norm": 0.19927920401096344, "kl": 0.7858500331640244, "learning_rate": 9.999907338074246e-06, "loss": -0.0562, "num_tokens": 21697011.0, "reward": 3.440100908279419, "reward_std": 1.607609510421753, "rewards/rollout_reward_func/mean": 3.440100908279419, "rewards/rollout_reward_func/std": 2.5475571155548096, "sampling/importance_sampling_ratio/max": 0.9753411412239075, "sampling/importance_sampling_ratio/mean": 0.7521929740905762, "sampling/importance_sampling_ratio/min": 0.004095300100743771, "sampling/sampling_logp_difference/max": 3.337043523788452, "sampling/sampling_logp_difference/mean": 0.1395970731973648, "step": 931, "step_time": 9.796559282001908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8813351280987263, "epoch": 0.00932, "grad_norm": 0.2122727930545807, "kl": 0.7637405246496201, "learning_rate": 9.999907130893666e-06, "loss": -0.0561, "step": 932, "step_time": 5.213238671001818 }, { "clip_ratio/high_max": 0.008928571827709675, "clip_ratio/high_mean": 0.004464285913854837, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 328.0, "completions/mean_terminated_length": 328.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.32987554743886, "epoch": 0.00933, "frac_reward_zero_std": 0.0, "grad_norm": 0.16126692295074463, "kl": 1.4746473096311092, "learning_rate": 9.999906923481732e-06, "loss": -0.0249, "num_tokens": 21743777.0, "reward": 3.1333773136138916, "reward_std": 2.2377328872680664, "rewards/rollout_reward_func/mean": 3.1333773136138916, "rewards/rollout_reward_func/std": 2.9289753437042236, "sampling/importance_sampling_ratio/max": 0.9807685017585754, "sampling/importance_sampling_ratio/mean": 0.7248220443725586, "sampling/importance_sampling_ratio/min": 0.0007170127355493605, "sampling/sampling_logp_difference/max": 2.1970465183258057, "sampling/sampling_logp_difference/mean": 0.1732548326253891, "step": 933, "step_time": 10.76512205400104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.3581970781087875, "epoch": 0.00934, "grad_norm": 0.17937108874320984, "kl": 1.4777687266469002, "learning_rate": 9.999906715838447e-06, "loss": -0.0251, "step": 934, "step_time": 5.378971756999817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015625, "completions/clipped_ratio": 0.0, "completions/max_length": 1113.0, "completions/max_terminated_length": 1113.0, "completions/mean_length": 331.28125, "completions/mean_terminated_length": 331.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2363539226353168, "epoch": 0.00935, "frac_reward_zero_std": 0.25, "grad_norm": 0.15652483701705933, "kl": 0.9583714436739683, "learning_rate": 9.999906507963806e-06, "loss": 0.005, "num_tokens": 21790293.0, "reward": 0.10382470488548279, "reward_std": 2.0783283710479736, "rewards/rollout_reward_func/mean": 0.10382470488548279, "rewards/rollout_reward_func/std": 3.0095794200897217, "sampling/importance_sampling_ratio/max": 0.9765172004699707, "sampling/importance_sampling_ratio/mean": 0.638792872428894, "sampling/importance_sampling_ratio/min": 0.0033468978945165873, "sampling/sampling_logp_difference/max": 3.0147550106048584, "sampling/sampling_logp_difference/mean": 0.2046303153038025, "step": 935, "step_time": 9.859415411001464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.2427086010575294, "epoch": 0.00936, "grad_norm": 0.3091643750667572, "kl": 0.917035867460072, "learning_rate": 9.99990629985781e-06, "loss": 0.0052, "step": 936, "step_time": 5.598189110000021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 993.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 337.75, "completions/mean_terminated_length": 332.58062744140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1008129976689816, "epoch": 0.00937, "frac_reward_zero_std": 0.25, "grad_norm": 0.31276148557662964, "kl": 0.836087204515934, "learning_rate": 9.999906091520464e-06, "loss": -0.0708, "num_tokens": 21837842.0, "reward": 3.0321452617645264, "reward_std": 2.46176815032959, "rewards/rollout_reward_func/mean": 3.0321452617645264, "rewards/rollout_reward_func/std": 2.84768009185791, "sampling/importance_sampling_ratio/max": 1.072562575340271, "sampling/importance_sampling_ratio/mean": 0.7474777102470398, "sampling/importance_sampling_ratio/min": 2.661484722921159e-05, "sampling/sampling_logp_difference/max": 2.0239768028259277, "sampling/sampling_logp_difference/mean": 0.17368674278259277, "step": 937, "step_time": 10.29493221499797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.105834499001503, "epoch": 0.00938, "grad_norm": 0.37910687923431396, "kl": 0.8376958258450031, "learning_rate": 9.999905882951763e-06, "loss": -0.0704, "step": 938, "step_time": 5.882831901002646 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 351.34375, "completions/mean_terminated_length": 351.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.3894302397966385, "epoch": 0.00939, "frac_reward_zero_std": 0.25, "grad_norm": 0.19636528193950653, "kl": 0.713177852332592, "learning_rate": 9.999905674151706e-06, "loss": -0.0388, "num_tokens": 21885152.0, "reward": 1.5599035024642944, "reward_std": 2.583451271057129, "rewards/rollout_reward_func/mean": 1.5599035024642944, "rewards/rollout_reward_func/std": 3.45617938041687, "sampling/importance_sampling_ratio/max": 0.9766248464584351, "sampling/importance_sampling_ratio/mean": 0.6654432415962219, "sampling/importance_sampling_ratio/min": 0.0009704749099910259, "sampling/sampling_logp_difference/max": 2.9864487648010254, "sampling/sampling_logp_difference/mean": 0.21420559287071228, "step": 939, "step_time": 9.515871097000854 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012276785913854837, "entropy": 1.38404730707407, "epoch": 0.0094, "grad_norm": 0.19435201585292816, "kl": 0.7064904849976301, "learning_rate": 9.999905465120299e-06, "loss": -0.0403, "step": 940, "step_time": 5.244949973997791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 251.375, "completions/mean_terminated_length": 251.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0193335823714733, "epoch": 0.00941, "frac_reward_zero_std": 0.25, "grad_norm": 0.22466538846492767, "kl": 0.9319156594574451, "learning_rate": 9.999905255857538e-06, "loss": -0.052, "num_tokens": 21929149.0, "reward": 1.6202985048294067, "reward_std": 1.763002634048462, "rewards/rollout_reward_func/mean": 1.6202985048294067, "rewards/rollout_reward_func/std": 3.428342342376709, "sampling/importance_sampling_ratio/max": 1.0000070333480835, "sampling/importance_sampling_ratio/mean": 0.745766282081604, "sampling/importance_sampling_ratio/min": 0.0013719015987589955, "sampling/sampling_logp_difference/max": 2.070451259613037, "sampling/sampling_logp_difference/mean": 0.13476206362247467, "step": 941, "step_time": 9.88490213300247 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.001370768994093, "epoch": 0.00942, "grad_norm": 0.21067897975444794, "kl": 0.9294730611145496, "learning_rate": 9.999905046363422e-06, "loss": -0.0527, "step": 942, "step_time": 6.1529924080005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 210.15625, "completions/mean_terminated_length": 195.9354705810547, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7513831704854965, "epoch": 0.00943, "frac_reward_zero_std": 0.5, "grad_norm": 0.2276192307472229, "kl": 0.7056524427607656, "learning_rate": 9.999904836637952e-06, "loss": -0.0009, "num_tokens": 21970648.0, "reward": 3.104433536529541, "reward_std": 0.702175498008728, "rewards/rollout_reward_func/mean": 3.104433536529541, "rewards/rollout_reward_func/std": 2.4587559700012207, "sampling/importance_sampling_ratio/max": 0.9762285351753235, "sampling/importance_sampling_ratio/mean": 0.801337480545044, "sampling/importance_sampling_ratio/min": 0.004074425902217627, "sampling/sampling_logp_difference/max": 2.3123388290405273, "sampling/sampling_logp_difference/mean": 0.08888568729162216, "step": 943, "step_time": 9.803197172001092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.7359586916863918, "epoch": 0.00944, "grad_norm": 0.2193102240562439, "kl": 0.7045159693807364, "learning_rate": 9.999904626681129e-06, "loss": -0.0011, "step": 944, "step_time": 5.399833004001266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008564814925193787, "clip_ratio/low_min": 0.004629629664123058, "clip_ratio/region_mean": 0.008564814925193787, "completions/clipped_ratio": 0.0, "completions/max_length": 1627.0, "completions/max_terminated_length": 1627.0, "completions/mean_length": 564.28125, "completions/mean_terminated_length": 564.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1662882827222347, "epoch": 0.00945, "frac_reward_zero_std": 0.25, "grad_norm": 0.20175062119960785, "kl": 0.9469821378588676, "learning_rate": 9.999904416492954e-06, "loss": -0.019, "num_tokens": 22025397.0, "reward": 3.069274425506592, "reward_std": 2.3191254138946533, "rewards/rollout_reward_func/mean": 3.069274425506592, "rewards/rollout_reward_func/std": 2.8843393325805664, "sampling/importance_sampling_ratio/max": 1.023833155632019, "sampling/importance_sampling_ratio/mean": 0.6704005599021912, "sampling/importance_sampling_ratio/min": 3.3288671298647987e-13, "sampling/sampling_logp_difference/max": 12.999974250793457, "sampling/sampling_logp_difference/mean": 0.22512242197990417, "step": 945, "step_time": 11.933020380996822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002314814832061529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002314814832061529, "entropy": 1.143497709184885, "epoch": 0.00946, "grad_norm": 0.177236407995224, "kl": 0.9353351593017578, "learning_rate": 9.999904206073426e-06, "loss": -0.0189, "step": 946, "step_time": 7.621519147000072 }, { "clip_ratio/high_max": 0.02142857201397419, "clip_ratio/high_mean": 0.010714286006987095, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010714286006987095, "completions/clipped_ratio": 0.0, "completions/max_length": 1514.0, "completions/max_terminated_length": 1514.0, "completions/mean_length": 647.09375, "completions/mean_terminated_length": 647.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.891018107533455, "epoch": 0.00947, "frac_reward_zero_std": 0.0, "grad_norm": 0.219405859708786, "kl": 1.3850853368639946, "learning_rate": 9.999903995422542e-06, "loss": -0.0503, "num_tokens": 22084378.0, "reward": 2.441446304321289, "reward_std": 3.5420351028442383, "rewards/rollout_reward_func/mean": 2.441446304321289, "rewards/rollout_reward_func/std": 3.5055601596832275, "sampling/importance_sampling_ratio/max": 0.9495251774787903, "sampling/importance_sampling_ratio/mean": 0.4496016502380371, "sampling/importance_sampling_ratio/min": 2.794237305707324e-19, "sampling/sampling_logp_difference/max": 12.913225173950195, "sampling/sampling_logp_difference/mean": 0.37229183316230774, "step": 947, "step_time": 11.883175598997695 }, { "clip_ratio/high_max": 0.03121657809242606, "clip_ratio/high_mean": 0.01560828904621303, "clip_ratio/low_mean": 0.011088329833000898, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02669661887921393, "entropy": 1.85268434882164, "epoch": 0.00948, "grad_norm": 0.2776240110397339, "kl": 1.476996161043644, "learning_rate": 9.999903784540305e-06, "loss": -0.0507, "step": 948, "step_time": 6.539877196999441 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1628.0, "completions/max_terminated_length": 1628.0, "completions/mean_length": 552.625, "completions/mean_terminated_length": 552.625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.204176813364029, "epoch": 0.00949, "frac_reward_zero_std": 0.0, "grad_norm": 0.30702176690101624, "kl": 0.9011413976550102, "learning_rate": 9.999903573426716e-06, "loss": -0.0229, "num_tokens": 22140329.0, "reward": 2.511946678161621, "reward_std": 2.305152177810669, "rewards/rollout_reward_func/mean": 2.511946678161621, "rewards/rollout_reward_func/std": 3.1055595874786377, "sampling/importance_sampling_ratio/max": 0.9694283604621887, "sampling/importance_sampling_ratio/mean": 0.6215726137161255, "sampling/importance_sampling_ratio/min": 0.022222623229026794, "sampling/sampling_logp_difference/max": 1.7773581743240356, "sampling/sampling_logp_difference/mean": 0.15133529901504517, "step": 949, "step_time": 11.79683425499752 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.021875000558793545, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021875000558793545, "entropy": 1.1978572085499763, "epoch": 0.0095, "grad_norm": 0.19833192229270935, "kl": 0.8663275092840195, "learning_rate": 9.999903362081772e-06, "loss": -0.0239, "step": 950, "step_time": 7.2741139179997845 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.017968750093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02277644257992506, "completions/clipped_ratio": 0.0, "completions/max_length": 1391.0, "completions/max_terminated_length": 1391.0, "completions/mean_length": 339.8125, "completions/mean_terminated_length": 339.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.691453279927373, "epoch": 0.00951, "frac_reward_zero_std": 0.25, "grad_norm": 0.279899924993515, "kl": 0.7251655738800764, "learning_rate": 9.999903150505478e-06, "loss": 0.0125, "num_tokens": 22187244.0, "reward": 1.046912431716919, "reward_std": 2.0597596168518066, "rewards/rollout_reward_func/mean": 1.046912431716919, "rewards/rollout_reward_func/std": 3.2923052310943604, "sampling/importance_sampling_ratio/max": 0.9736343026161194, "sampling/importance_sampling_ratio/mean": 0.575299859046936, "sampling/importance_sampling_ratio/min": 2.7199683774213677e-18, "sampling/sampling_logp_difference/max": 16.129711151123047, "sampling/sampling_logp_difference/mean": 0.6108942031860352, "step": 951, "step_time": 11.436897870000394 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01171875, "entropy": 1.6909706871956587, "epoch": 0.00952, "grad_norm": 0.17346276342868805, "kl": 0.7084530014544725, "learning_rate": 9.999902938697827e-06, "loss": 0.0119, "step": 952, "step_time": 6.196523752998473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1061.0, "completions/max_terminated_length": 1061.0, "completions/mean_length": 235.15625, "completions/mean_terminated_length": 235.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6844960823655128, "epoch": 0.00953, "frac_reward_zero_std": 0.5, "grad_norm": 0.2963378429412842, "kl": 0.4356670258566737, "learning_rate": 9.999902726658824e-06, "loss": -0.0269, "num_tokens": 22228801.0, "reward": 3.04504656791687, "reward_std": 1.7038389444351196, "rewards/rollout_reward_func/mean": 3.04504656791687, "rewards/rollout_reward_func/std": 2.495493173599243, "sampling/importance_sampling_ratio/max": 0.9761043190956116, "sampling/importance_sampling_ratio/mean": 0.803324282169342, "sampling/importance_sampling_ratio/min": 0.1286950409412384, "sampling/sampling_logp_difference/max": 1.2092030048370361, "sampling/sampling_logp_difference/mean": 0.07528349757194519, "step": 953, "step_time": 9.609751595000489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.6756969429552555, "epoch": 0.00954, "grad_norm": 0.26649409532546997, "kl": 0.4352977406233549, "learning_rate": 9.999902514388468e-06, "loss": -0.0276, "step": 954, "step_time": 5.910777030998361 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1058.0, "completions/max_terminated_length": 1058.0, "completions/mean_length": 400.8125, "completions/mean_terminated_length": 400.8125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7894075624644756, "epoch": 0.00955, "frac_reward_zero_std": 0.25, "grad_norm": 0.4383862018585205, "kl": 0.6930718366056681, "learning_rate": 9.999902301886758e-06, "loss": -0.003, "num_tokens": 22277995.0, "reward": 4.201257705688477, "reward_std": 0.9823904633522034, "rewards/rollout_reward_func/mean": 4.201257705688477, "rewards/rollout_reward_func/std": 1.7357641458511353, "sampling/importance_sampling_ratio/max": 1.0910332202911377, "sampling/importance_sampling_ratio/mean": 0.7660735845565796, "sampling/importance_sampling_ratio/min": 0.16521160304546356, "sampling/sampling_logp_difference/max": 1.4061264991760254, "sampling/sampling_logp_difference/mean": 0.08134962618350983, "step": 955, "step_time": 9.868992646001061 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 0.7829556204378605, "epoch": 0.00956, "grad_norm": 0.29176831245422363, "kl": 0.6911586206406355, "learning_rate": 9.999902089153695e-06, "loss": -0.0039, "step": 956, "step_time": 5.344928896000056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1298.0, "completions/max_terminated_length": 1298.0, "completions/mean_length": 363.25, "completions/mean_terminated_length": 363.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1390176936984062, "epoch": 0.00957, "frac_reward_zero_std": 0.0, "grad_norm": 0.2101103961467743, "kl": 1.1806566640734673, "learning_rate": 9.99990187618928e-06, "loss": -0.0364, "num_tokens": 22326590.0, "reward": 2.389768600463867, "reward_std": 3.0985257625579834, "rewards/rollout_reward_func/mean": 2.389768600463867, "rewards/rollout_reward_func/std": 3.373446464538574, "sampling/importance_sampling_ratio/max": 1.1365231275558472, "sampling/importance_sampling_ratio/mean": 0.7030957937240601, "sampling/importance_sampling_ratio/min": 0.03876598924398422, "sampling/sampling_logp_difference/max": 1.4241676330566406, "sampling/sampling_logp_difference/mean": 0.12277522683143616, "step": 957, "step_time": 10.541149480000968 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.1566676795482635, "epoch": 0.00958, "grad_norm": 0.19913563132286072, "kl": 1.1877775564789772, "learning_rate": 9.99990166299351e-06, "loss": -0.0368, "step": 958, "step_time": 6.482480846001636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 299.3125, "completions/mean_terminated_length": 299.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.618058368563652, "epoch": 0.00959, "frac_reward_zero_std": 0.5, "grad_norm": 0.3813503086566925, "kl": 0.7456014454364777, "learning_rate": 9.999901449566387e-06, "loss": 0.0056, "num_tokens": 22370871.0, "reward": 2.267953872680664, "reward_std": 0.8068459033966064, "rewards/rollout_reward_func/mean": 2.267953872680664, "rewards/rollout_reward_func/std": 3.3745598793029785, "sampling/importance_sampling_ratio/max": 0.9752769470214844, "sampling/importance_sampling_ratio/mean": 0.8073369264602661, "sampling/importance_sampling_ratio/min": 1.4774701466541379e-15, "sampling/sampling_logp_difference/max": 13.499998092651367, "sampling/sampling_logp_difference/mean": 0.2508825957775116, "step": 959, "step_time": 10.318389177999052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008653846336528659, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008653846336528659, "entropy": 0.6358408611267805, "epoch": 0.0096, "grad_norm": 0.19779658317565918, "kl": 0.8196255192160606, "learning_rate": 9.999901235907912e-06, "loss": 0.0049, "step": 960, "step_time": 5.670318345999476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 518.0, "completions/max_terminated_length": 518.0, "completions/mean_length": 194.0625, "completions/mean_terminated_length": 194.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6526913642883301, "epoch": 0.00961, "frac_reward_zero_std": 0.5, "grad_norm": 0.2759893834590912, "kl": 0.8031892371363938, "learning_rate": 9.999901022018081e-06, "loss": -0.0064, "num_tokens": 22412236.0, "reward": 2.6735455989837646, "reward_std": 1.2654085159301758, "rewards/rollout_reward_func/mean": 2.6735455989837646, "rewards/rollout_reward_func/std": 2.888068914413452, "sampling/importance_sampling_ratio/max": 0.9758802652359009, "sampling/importance_sampling_ratio/mean": 0.8635016083717346, "sampling/importance_sampling_ratio/min": 2.097888686675966e-17, "sampling/sampling_logp_difference/max": 20.121267318725586, "sampling/sampling_logp_difference/mean": 0.3096221089363098, "step": 961, "step_time": 7.786714950003443 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.6749190744012594, "epoch": 0.00962, "grad_norm": 0.23127050697803497, "kl": 0.8021639231592417, "learning_rate": 9.999900807896898e-06, "loss": -0.0072, "step": 962, "step_time": 4.249414851001347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1882.0, "completions/max_terminated_length": 1882.0, "completions/mean_length": 343.0, "completions/mean_terminated_length": 343.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7875005453824997, "epoch": 0.00963, "frac_reward_zero_std": 0.25, "grad_norm": 0.31861668825149536, "kl": 0.6595656741410494, "learning_rate": 9.999900593544362e-06, "loss": -0.0241, "num_tokens": 22458598.0, "reward": 3.047184944152832, "reward_std": 1.3783068656921387, "rewards/rollout_reward_func/mean": 3.047184944152832, "rewards/rollout_reward_func/std": 2.525599718093872, "sampling/importance_sampling_ratio/max": 0.993157148361206, "sampling/importance_sampling_ratio/mean": 0.7855409383773804, "sampling/importance_sampling_ratio/min": 0.08218888938426971, "sampling/sampling_logp_difference/max": 1.6369976997375488, "sampling/sampling_logp_difference/mean": 0.08787128329277039, "step": 963, "step_time": 12.619845306999196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8113647066056728, "epoch": 0.00964, "grad_norm": 0.2945137619972229, "kl": 0.6597204897552729, "learning_rate": 9.999900378960474e-06, "loss": -0.0251, "step": 964, "step_time": 7.137015716996757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.00390625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00390625, "completions/clipped_ratio": 0.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 694.3125, "completions/mean_terminated_length": 694.3125, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 0.8741144724190235, "epoch": 0.00965, "frac_reward_zero_std": 0.0, "grad_norm": 0.15921397507190704, "kl": 1.1057477369904518, "learning_rate": 9.999900164145231e-06, "loss": -0.0007, "num_tokens": 22520844.0, "reward": 3.6184163093566895, "reward_std": 1.8362153768539429, "rewards/rollout_reward_func/mean": 3.6184163093566895, "rewards/rollout_reward_func/std": 2.6250107288360596, "sampling/importance_sampling_ratio/max": 1.1785625219345093, "sampling/importance_sampling_ratio/mean": 0.6438170671463013, "sampling/importance_sampling_ratio/min": 5.278134436763075e-18, "sampling/sampling_logp_difference/max": 12.584242820739746, "sampling/sampling_logp_difference/mean": 0.27534377574920654, "step": 965, "step_time": 12.354919756999152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8899529278278351, "epoch": 0.00966, "grad_norm": 0.1728488951921463, "kl": 1.108468383550644, "learning_rate": 9.999899949098636e-06, "loss": -0.0011, "step": 966, "step_time": 6.826519220996488 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.03125, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 299.65625, "completions/mean_terminated_length": 292.9677429199219, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.6384041383862495, "epoch": 0.00967, "frac_reward_zero_std": 0.25, "grad_norm": 0.3297528922557831, "kl": 1.6013627350330353, "learning_rate": 9.999899733820688e-06, "loss": -0.0428, "num_tokens": 22567149.0, "reward": 1.527573823928833, "reward_std": 2.3808681964874268, "rewards/rollout_reward_func/mean": 1.527573823928833, "rewards/rollout_reward_func/std": 3.3711557388305664, "sampling/importance_sampling_ratio/max": 0.9736432433128357, "sampling/importance_sampling_ratio/mean": 0.5762228965759277, "sampling/importance_sampling_ratio/min": 0.0001273742673220113, "sampling/sampling_logp_difference/max": 2.290717363357544, "sampling/sampling_logp_difference/mean": 0.2451605200767517, "step": 967, "step_time": 10.797090527004912 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010489510837942362, "entropy": 1.6611352525651455, "epoch": 0.00968, "grad_norm": 0.25856637954711914, "kl": 1.7289058677852154, "learning_rate": 9.999899518311386e-06, "loss": -0.043, "step": 968, "step_time": 6.277742421001676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1000.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 147.40625, "completions/mean_terminated_length": 147.40625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.6317494213581085, "epoch": 0.00969, "frac_reward_zero_std": 0.5, "grad_norm": 0.22203591465950012, "kl": 0.8523571863770485, "learning_rate": 9.99989930257073e-06, "loss": -0.0343, "num_tokens": 22606138.0, "reward": 0.9549648761749268, "reward_std": 0.983135461807251, "rewards/rollout_reward_func/mean": 0.9549648761749268, "rewards/rollout_reward_func/std": 3.2568042278289795, "sampling/importance_sampling_ratio/max": 0.975656270980835, "sampling/importance_sampling_ratio/mean": 0.6641311049461365, "sampling/importance_sampling_ratio/min": 0.003049147082492709, "sampling/sampling_logp_difference/max": 3.1196682453155518, "sampling/sampling_logp_difference/mean": 0.2787490487098694, "step": 969, "step_time": 9.368240288004017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.630617156624794, "epoch": 0.0097, "grad_norm": 0.2194519191980362, "kl": 0.8710319958627224, "learning_rate": 9.999899086598724e-06, "loss": -0.0343, "step": 970, "step_time": 5.325560708000921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 265.90625, "completions/mean_terminated_length": 265.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0741381645202637, "epoch": 0.00971, "frac_reward_zero_std": 0.5, "grad_norm": 0.07007502764463425, "kl": 0.6021126825362444, "learning_rate": 9.999898870395361e-06, "loss": -0.0133, "num_tokens": 22651184.0, "reward": 1.34357488155365, "reward_std": 1.3331828117370605, "rewards/rollout_reward_func/mean": 1.34357488155365, "rewards/rollout_reward_func/std": 3.644157648086548, "sampling/importance_sampling_ratio/max": 0.9849569797515869, "sampling/importance_sampling_ratio/mean": 0.7246544361114502, "sampling/importance_sampling_ratio/min": 0.0023870652075856924, "sampling/sampling_logp_difference/max": 2.025298595428467, "sampling/sampling_logp_difference/mean": 0.1707531213760376, "step": 971, "step_time": 9.03672319800171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.015224359463900328, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.015224359463900328, "entropy": 1.0927535817027092, "epoch": 0.00972, "grad_norm": 0.05398321896791458, "kl": 0.6149727180600166, "learning_rate": 9.999898653960649e-06, "loss": -0.0133, "step": 972, "step_time": 5.171385947001909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 533.3125, "completions/mean_terminated_length": 533.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.8459103293716908, "epoch": 0.00973, "frac_reward_zero_std": 0.25, "grad_norm": 0.24535958468914032, "kl": 1.6162548251450062, "learning_rate": 9.999898437294581e-06, "loss": -0.0242, "num_tokens": 22705733.0, "reward": 1.6849610805511475, "reward_std": 2.3471336364746094, "rewards/rollout_reward_func/mean": 1.6849610805511475, "rewards/rollout_reward_func/std": 3.5448341369628906, "sampling/importance_sampling_ratio/max": 0.9737962484359741, "sampling/importance_sampling_ratio/mean": 0.5412124395370483, "sampling/importance_sampling_ratio/min": 0.0012210222193971276, "sampling/sampling_logp_difference/max": 2.195463180541992, "sampling/sampling_logp_difference/mean": 0.24870017170906067, "step": 973, "step_time": 11.161791139002162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.853357370942831, "epoch": 0.00974, "grad_norm": 0.23032785952091217, "kl": 1.506647266447544, "learning_rate": 9.99989822039716e-06, "loss": -0.0253, "step": 974, "step_time": 6.1258974419997685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1855.0, "completions/max_terminated_length": 1855.0, "completions/mean_length": 798.65625, "completions/mean_terminated_length": 798.65625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.2721164226531982, "epoch": 0.00975, "frac_reward_zero_std": 0.0, "grad_norm": 0.3521071970462799, "kl": 1.5423980951309204, "learning_rate": 9.999898003268387e-06, "loss": -0.053, "num_tokens": 22769917.0, "reward": 1.4125183820724487, "reward_std": 3.3644793033599854, "rewards/rollout_reward_func/mean": 1.4125183820724487, "rewards/rollout_reward_func/std": 3.3725147247314453, "sampling/importance_sampling_ratio/max": 0.9891765117645264, "sampling/importance_sampling_ratio/mean": 0.37409543991088867, "sampling/importance_sampling_ratio/min": 0.0007682287832722068, "sampling/sampling_logp_difference/max": 2.4642527103424072, "sampling/sampling_logp_difference/mean": 0.2951534390449524, "step": 975, "step_time": 13.719720798002527 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01105769257992506, "entropy": 2.2441540956497192, "epoch": 0.00976, "grad_norm": 0.24144750833511353, "kl": 1.5487038493156433, "learning_rate": 9.99989778590826e-06, "loss": -0.054, "step": 976, "step_time": 7.789680859999862 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 410.75, "completions/mean_terminated_length": 410.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1310129016637802, "epoch": 0.00977, "frac_reward_zero_std": 0.0, "grad_norm": 0.36070340871810913, "kl": 1.234673522412777, "learning_rate": 9.99989756831678e-06, "loss": -0.0087, "num_tokens": 22821286.0, "reward": 3.1703739166259766, "reward_std": 1.5317487716674805, "rewards/rollout_reward_func/mean": 3.1703739166259766, "rewards/rollout_reward_func/std": 2.9470627307891846, "sampling/importance_sampling_ratio/max": 0.9486209154129028, "sampling/importance_sampling_ratio/mean": 0.6622523069381714, "sampling/importance_sampling_ratio/min": 1.2453385067744828e-20, "sampling/sampling_logp_difference/max": 14.970890045166016, "sampling/sampling_logp_difference/mean": 0.4913831949234009, "step": 977, "step_time": 9.939107830001376 }, { "clip_ratio/high_max": 0.041038007009774446, "clip_ratio/high_mean": 0.026769003132358193, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.033019002759829164, "entropy": 1.1440407149493694, "epoch": 0.00978, "grad_norm": 0.2310323417186737, "kl": 1.2437446936964989, "learning_rate": 9.999897350493949e-06, "loss": -0.009, "step": 978, "step_time": 5.415251866999824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004315476398915052, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004315476398915052, "completions/clipped_ratio": 0.03125, "completions/max_length": 1860.0, "completions/max_terminated_length": 1860.0, "completions/mean_length": 669.3125, "completions/mean_terminated_length": 671.3225708007812, "completions/min_length": 286.0, "completions/min_terminated_length": 286.0, "entropy": 1.576929997652769, "epoch": 0.00979, "frac_reward_zero_std": 0.0, "grad_norm": 0.1865396797657013, "kl": 0.9906807094812393, "learning_rate": 9.999897132439763e-06, "loss": -0.0268, "num_tokens": 22880801.0, "reward": 3.3094327449798584, "reward_std": 2.182387351989746, "rewards/rollout_reward_func/mean": 3.3094327449798584, "rewards/rollout_reward_func/std": 2.654918909072876, "sampling/importance_sampling_ratio/max": 0.974922239780426, "sampling/importance_sampling_ratio/mean": 0.5019469857215881, "sampling/importance_sampling_ratio/min": 1.789115233463599e-17, "sampling/sampling_logp_difference/max": 16.261892318725586, "sampling/sampling_logp_difference/mean": 0.34983158111572266, "step": 979, "step_time": 13.477930850996927 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.007843137485906482, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013051470974460244, "entropy": 1.545237634330988, "epoch": 0.0098, "grad_norm": 0.16611747443675995, "kl": 0.974461130797863, "learning_rate": 9.999896914154225e-06, "loss": -0.0275, "step": 980, "step_time": 7.205659944003855 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1396.0, "completions/max_terminated_length": 1396.0, "completions/mean_length": 433.0625, "completions/mean_terminated_length": 433.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.879557877779007, "epoch": 0.00981, "frac_reward_zero_std": 0.25, "grad_norm": 0.23453566431999207, "kl": 1.0761270597577095, "learning_rate": 9.999896695637334e-06, "loss": -0.0269, "num_tokens": 22931704.0, "reward": 2.0530433654785156, "reward_std": 2.4314157962799072, "rewards/rollout_reward_func/mean": 2.0530433654785156, "rewards/rollout_reward_func/std": 3.3862037658691406, "sampling/importance_sampling_ratio/max": 0.9966909289360046, "sampling/importance_sampling_ratio/mean": 0.5371524095535278, "sampling/importance_sampling_ratio/min": 1.802805443178266e-15, "sampling/sampling_logp_difference/max": 18.526357650756836, "sampling/sampling_logp_difference/mean": 0.3925587832927704, "step": 981, "step_time": 11.688039868999113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.8546783700585365, "epoch": 0.00982, "grad_norm": 0.20278117060661316, "kl": 1.0501809753477573, "learning_rate": 9.999896476889088e-06, "loss": -0.0276, "step": 982, "step_time": 6.242557694002244 }, { "clip_ratio/high_max": 0.0059523810632526875, "clip_ratio/high_mean": 0.0029761905316263437, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0029761905316263437, "completions/clipped_ratio": 0.03125, "completions/max_length": 800.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 300.21875, "completions/mean_terminated_length": 309.3870849609375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.466778725385666, "epoch": 0.00983, "frac_reward_zero_std": 0.0, "grad_norm": 0.14789412915706635, "kl": 1.2969720736145973, "learning_rate": 9.999896257909492e-06, "loss": -0.0437, "num_tokens": 22979328.0, "reward": 2.036370277404785, "reward_std": 1.7837071418762207, "rewards/rollout_reward_func/mean": 2.036370277404785, "rewards/rollout_reward_func/std": 3.2488832473754883, "sampling/importance_sampling_ratio/max": 0.9684182405471802, "sampling/importance_sampling_ratio/mean": 0.6242291927337646, "sampling/importance_sampling_ratio/min": 0.0001338013680651784, "sampling/sampling_logp_difference/max": 2.286747694015503, "sampling/sampling_logp_difference/mean": 0.21406161785125732, "step": 983, "step_time": 8.967888735000088 }, { "clip_ratio/high_max": 0.04681776603683829, "clip_ratio/high_mean": 0.023408883018419147, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027575549902394414, "entropy": 1.4186907857656479, "epoch": 0.00984, "grad_norm": 0.1272258311510086, "kl": 1.262532364577055, "learning_rate": 9.999896038698543e-06, "loss": -0.0439, "step": 984, "step_time": 5.427049975995033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 714.5625, "completions/mean_terminated_length": 714.5625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5867650359869003, "epoch": 0.00985, "frac_reward_zero_std": 0.0, "grad_norm": 0.2004929929971695, "kl": 1.3771099150180817, "learning_rate": 9.999895819256238e-06, "loss": -0.0724, "num_tokens": 23041423.0, "reward": 1.3965332508087158, "reward_std": 3.2267022132873535, "rewards/rollout_reward_func/mean": 1.3965332508087158, "rewards/rollout_reward_func/std": 3.3486223220825195, "sampling/importance_sampling_ratio/max": 1.0629206895828247, "sampling/importance_sampling_ratio/mean": 0.5916879177093506, "sampling/importance_sampling_ratio/min": 0.007573488634079695, "sampling/sampling_logp_difference/max": 2.1056156158447266, "sampling/sampling_logp_difference/mean": 0.1684565246105194, "step": 985, "step_time": 12.184546938995481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004464285913854837, "entropy": 1.5604513138532639, "epoch": 0.00986, "grad_norm": 0.1985146701335907, "kl": 1.3756827861070633, "learning_rate": 9.99989559958258e-06, "loss": -0.0727, "step": 986, "step_time": 6.339055661997918 }, { "clip_ratio/high_max": 0.01785714365541935, "clip_ratio/high_mean": 0.008928571827709675, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008928571827709675, "completions/clipped_ratio": 0.0, "completions/max_length": 1219.0, "completions/max_terminated_length": 1219.0, "completions/mean_length": 427.6875, "completions/mean_terminated_length": 427.6875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.188322365283966, "epoch": 0.00987, "frac_reward_zero_std": 0.0, "grad_norm": 0.29801276326179504, "kl": 1.7043517082929611, "learning_rate": 9.999895379677572e-06, "loss": -0.0496, "num_tokens": 23093796.0, "reward": 0.1763962209224701, "reward_std": 3.164689064025879, "rewards/rollout_reward_func/mean": 0.1763962209224701, "rewards/rollout_reward_func/std": 3.214573621749878, "sampling/importance_sampling_ratio/max": 0.9627614617347717, "sampling/importance_sampling_ratio/mean": 0.4764752984046936, "sampling/importance_sampling_ratio/min": 1.2528738119621111e-17, "sampling/sampling_logp_difference/max": 12.209505081176758, "sampling/sampling_logp_difference/mean": 0.4353058934211731, "step": 987, "step_time": 10.43669796499853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014062500093132257, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 2.1747878789901733, "epoch": 0.00988, "grad_norm": 0.31949084997177124, "kl": 1.6943484619259834, "learning_rate": 9.99989515954121e-06, "loss": -0.0507, "step": 988, "step_time": 6.446374389999619 }, { "clip_ratio/high_max": 0.015277778264135122, "clip_ratio/high_mean": 0.007638889132067561, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007638889132067561, "completions/clipped_ratio": 0.0, "completions/max_length": 1672.0, "completions/max_terminated_length": 1672.0, "completions/mean_length": 485.28125, "completions/mean_terminated_length": 485.28125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1350589767098427, "epoch": 0.00989, "frac_reward_zero_std": 0.25, "grad_norm": 0.24347515404224396, "kl": 0.8167408406734467, "learning_rate": 9.999894939173495e-06, "loss": -0.0211, "num_tokens": 23145712.0, "reward": 2.6168453693389893, "reward_std": 2.6275932788848877, "rewards/rollout_reward_func/mean": 2.6168453693389893, "rewards/rollout_reward_func/std": 3.338613271713257, "sampling/importance_sampling_ratio/max": 0.9754573702812195, "sampling/importance_sampling_ratio/mean": 0.6544622778892517, "sampling/importance_sampling_ratio/min": 0.009432174265384674, "sampling/sampling_logp_difference/max": 2.0480902194976807, "sampling/sampling_logp_difference/mean": 0.13072723150253296, "step": 989, "step_time": 12.86204227300368 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013095238711684942, "entropy": 1.1421939730644226, "epoch": 0.0099, "grad_norm": 0.14441929757595062, "kl": 0.8122344613075256, "learning_rate": 9.999894718574428e-06, "loss": -0.0214, "step": 990, "step_time": 6.926523004998671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.03125, "completions/max_length": 1232.0, "completions/max_terminated_length": 1232.0, "completions/mean_length": 571.65625, "completions/mean_terminated_length": 573.741943359375, "completions/min_length": 276.0, "completions/min_terminated_length": 276.0, "entropy": 1.8836051598191261, "epoch": 0.00991, "frac_reward_zero_std": 0.0, "grad_norm": 0.26686426997184753, "kl": 1.2570402659475803, "learning_rate": 9.999894497744008e-06, "loss": -0.0254, "num_tokens": 23203483.0, "reward": 2.5401716232299805, "reward_std": 3.0411412715911865, "rewards/rollout_reward_func/mean": 2.5401716232299805, "rewards/rollout_reward_func/std": 3.122842788696289, "sampling/importance_sampling_ratio/max": 0.9483185410499573, "sampling/importance_sampling_ratio/mean": 0.500088095664978, "sampling/importance_sampling_ratio/min": 3.433673701272455e-12, "sampling/sampling_logp_difference/max": 18.663726806640625, "sampling/sampling_logp_difference/mean": 0.4005594253540039, "step": 991, "step_time": 10.766450464998343 }, { "clip_ratio/high_max": 0.02083333395421505, "clip_ratio/high_mean": 0.010416666977107525, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013134058332070708, "entropy": 1.887473851442337, "epoch": 0.00992, "grad_norm": 0.2326996624469757, "kl": 1.2487915568053722, "learning_rate": 9.999894276682234e-06, "loss": -0.026, "step": 992, "step_time": 6.488234280999677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005831643007695675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005831643007695675, "completions/clipped_ratio": 0.0, "completions/max_length": 1346.0, "completions/max_terminated_length": 1346.0, "completions/mean_length": 417.75, "completions/mean_terminated_length": 417.75, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.3847399540245533, "epoch": 0.00993, "frac_reward_zero_std": 0.25, "grad_norm": 0.33722561597824097, "kl": 1.077914010733366, "learning_rate": 9.999894055389108e-06, "loss": -0.0379, "num_tokens": 23254529.0, "reward": 3.414957046508789, "reward_std": 1.5464951992034912, "rewards/rollout_reward_func/mean": 3.414957046508789, "rewards/rollout_reward_func/std": 2.4555935859680176, "sampling/importance_sampling_ratio/max": 0.9803106188774109, "sampling/importance_sampling_ratio/mean": 0.6708482503890991, "sampling/importance_sampling_ratio/min": 3.08627716482171e-13, "sampling/sampling_logp_difference/max": 14.174957275390625, "sampling/sampling_logp_difference/mean": 0.38461562991142273, "step": 993, "step_time": 10.986957050998171 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.007986815413460135, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019918633857741952, "entropy": 1.3788106366991997, "epoch": 0.00994, "grad_norm": 0.19867442548274994, "kl": 1.0619581639766693, "learning_rate": 9.999893833864629e-06, "loss": -0.0387, "step": 994, "step_time": 6.63026814099976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1718.0, "completions/max_terminated_length": 1718.0, "completions/mean_length": 573.40625, "completions/mean_terminated_length": 573.40625, "completions/min_length": 262.0, "completions/min_terminated_length": 262.0, "entropy": 1.2136247083544731, "epoch": 0.00995, "frac_reward_zero_std": 0.25, "grad_norm": 0.10722152888774872, "kl": 1.117992326617241, "learning_rate": 9.999893612108798e-06, "loss": -0.0285, "num_tokens": 23312279.0, "reward": 3.8319592475891113, "reward_std": 1.6386070251464844, "rewards/rollout_reward_func/mean": 3.8319592475891113, "rewards/rollout_reward_func/std": 2.673532485961914, "sampling/importance_sampling_ratio/max": 1.0188289880752563, "sampling/importance_sampling_ratio/mean": 0.6506704688072205, "sampling/importance_sampling_ratio/min": 0.0042777396738529205, "sampling/sampling_logp_difference/max": 2.0347793102264404, "sampling/sampling_logp_difference/mean": 0.14292477071285248, "step": 995, "step_time": 12.516301502002534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 1.2255747467279434, "epoch": 0.00996, "grad_norm": 0.12242919206619263, "kl": 1.1198020353913307, "learning_rate": 9.999893390121612e-06, "loss": -0.0281, "step": 996, "step_time": 6.848547035000593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "completions/clipped_ratio": 0.0625, "completions/max_length": 1157.0, "completions/max_terminated_length": 1157.0, "completions/mean_length": 444.28125, "completions/mean_terminated_length": 433.5000305175781, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.5275696218013763, "epoch": 0.00997, "frac_reward_zero_std": 0.25, "grad_norm": 0.27591586112976074, "kl": 0.8288926593959332, "learning_rate": 9.999893167903075e-06, "loss": -0.044, "num_tokens": 23362785.0, "reward": 1.8101319074630737, "reward_std": 2.4151556491851807, "rewards/rollout_reward_func/mean": 1.8101319074630737, "rewards/rollout_reward_func/std": 3.2576045989990234, "sampling/importance_sampling_ratio/max": 0.9756661057472229, "sampling/importance_sampling_ratio/mean": 0.5884767770767212, "sampling/importance_sampling_ratio/min": 0.00017547431343700737, "sampling/sampling_logp_difference/max": 2.030649185180664, "sampling/sampling_logp_difference/mean": 0.2189461886882782, "step": 997, "step_time": 11.43516998999985 }, { "clip_ratio/high_max": 0.004999999888241291, "clip_ratio/high_mean": 0.0024999999441206455, "clip_ratio/low_mean": 0.004464285913854837, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.006964285857975483, "entropy": 1.5445934757590294, "epoch": 0.00998, "grad_norm": 0.20881015062332153, "kl": 0.8473883159458637, "learning_rate": 9.999892945453185e-06, "loss": -0.0449, "step": 998, "step_time": 6.144475315002637 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.0, "completions/max_length": 1209.0, "completions/max_terminated_length": 1209.0, "completions/mean_length": 325.0625, "completions/mean_terminated_length": 325.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6664648503065109, "epoch": 0.00999, "frac_reward_zero_std": 0.5, "grad_norm": 0.227505624294281, "kl": 0.7020930843427777, "learning_rate": 9.999892722771942e-06, "loss": -0.0148, "num_tokens": 23406860.0, "reward": 3.796489715576172, "reward_std": 0.9355168342590332, "rewards/rollout_reward_func/mean": 3.796489715576172, "rewards/rollout_reward_func/std": 1.932724118232727, "sampling/importance_sampling_ratio/max": 0.9885759949684143, "sampling/importance_sampling_ratio/mean": 0.7862951755523682, "sampling/importance_sampling_ratio/min": 2.4160856725941125e-15, "sampling/sampling_logp_difference/max": 14.871663093566895, "sampling/sampling_logp_difference/mean": 0.22015725076198578, "step": 999, "step_time": 10.635951459997159 }, { "clip_ratio/high_max": 0.004807692486792803, "clip_ratio/high_mean": 0.0024038462433964014, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "entropy": 0.6491342633962631, "epoch": 0.01, "grad_norm": 0.1916346251964569, "kl": 0.6991082467138767, "learning_rate": 9.999892499859347e-06, "loss": -0.015, "step": 1000, "step_time": 5.903614542996365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1264.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 306.59375, "completions/mean_terminated_length": 306.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9593273922801018, "epoch": 0.01001, "frac_reward_zero_std": 0.5, "grad_norm": 0.28653258085250854, "kl": 0.8902090666815639, "learning_rate": 9.999892276715398e-06, "loss": -0.0101, "num_tokens": 23450300.0, "reward": 2.389892578125, "reward_std": 1.6559950113296509, "rewards/rollout_reward_func/mean": 2.389892578125, "rewards/rollout_reward_func/std": 2.830012321472168, "sampling/importance_sampling_ratio/max": 0.9765813946723938, "sampling/importance_sampling_ratio/mean": 0.7610787749290466, "sampling/importance_sampling_ratio/min": 0.002164725214242935, "sampling/sampling_logp_difference/max": 1.6795601844787598, "sampling/sampling_logp_difference/mean": 0.13824227452278137, "step": 1001, "step_time": 10.906029325002237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9586819838732481, "epoch": 0.01002, "grad_norm": 0.2670210301876068, "kl": 0.8986543100327253, "learning_rate": 9.9998920533401e-06, "loss": -0.0103, "step": 1002, "step_time": 6.410187536999729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1095.0, "completions/max_terminated_length": 1095.0, "completions/mean_length": 351.0625, "completions/mean_terminated_length": 351.0625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8161295764148235, "epoch": 0.01003, "frac_reward_zero_std": 0.25, "grad_norm": 0.3257586658000946, "kl": 0.766305822879076, "learning_rate": 9.999891829733444e-06, "loss": 0.0005, "num_tokens": 23498554.0, "reward": 3.190845012664795, "reward_std": 1.7719683647155762, "rewards/rollout_reward_func/mean": 3.190845012664795, "rewards/rollout_reward_func/std": 2.6262423992156982, "sampling/importance_sampling_ratio/max": 0.9774537086486816, "sampling/importance_sampling_ratio/mean": 0.7579287886619568, "sampling/importance_sampling_ratio/min": 0.039349012076854706, "sampling/sampling_logp_difference/max": 1.4032065868377686, "sampling/sampling_logp_difference/mean": 0.09004464000463486, "step": 1003, "step_time": 10.172957489998225 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0034722222480922937, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009154040599241853, "entropy": 0.8334408886730671, "epoch": 0.01004, "grad_norm": 0.361068993806839, "kl": 0.7627491373568773, "learning_rate": 9.999891605895438e-06, "loss": 0.0008, "step": 1004, "step_time": 5.609700169999996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1158.0, "completions/max_terminated_length": 1158.0, "completions/mean_length": 476.21875, "completions/mean_terminated_length": 476.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.6012153327465057, "epoch": 0.01005, "frac_reward_zero_std": 0.0, "grad_norm": 0.33839839696884155, "kl": 1.2257004901766777, "learning_rate": 9.999891381826079e-06, "loss": -0.0489, "num_tokens": 23553251.0, "reward": 0.7445789575576782, "reward_std": 2.8521299362182617, "rewards/rollout_reward_func/mean": 0.7445789575576782, "rewards/rollout_reward_func/std": 3.749737501144409, "sampling/importance_sampling_ratio/max": 0.9838196635246277, "sampling/importance_sampling_ratio/mean": 0.5757448077201843, "sampling/importance_sampling_ratio/min": 3.022434909335263e-17, "sampling/sampling_logp_difference/max": 13.763121604919434, "sampling/sampling_logp_difference/mean": 0.3895391821861267, "step": 1005, "step_time": 11.235779480000929 }, { "clip_ratio/high_max": 0.020979021675884724, "clip_ratio/high_mean": 0.010489510837942362, "clip_ratio/low_mean": 0.015625, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.026114510837942362, "entropy": 1.5907497555017471, "epoch": 0.01006, "grad_norm": 0.25680240988731384, "kl": 1.2314657792448997, "learning_rate": 9.999891157525368e-06, "loss": -0.0496, "step": 1006, "step_time": 5.822939054001836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 352.4375, "completions/mean_terminated_length": 352.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1978452987968922, "epoch": 0.01007, "frac_reward_zero_std": 0.25, "grad_norm": 0.16442003846168518, "kl": 1.1839209608733654, "learning_rate": 9.999890932993303e-06, "loss": -0.0705, "num_tokens": 23600865.0, "reward": 3.0190446376800537, "reward_std": 2.598637104034424, "rewards/rollout_reward_func/mean": 3.0190446376800537, "rewards/rollout_reward_func/std": 2.94878888130188, "sampling/importance_sampling_ratio/max": 0.9834364652633667, "sampling/importance_sampling_ratio/mean": 0.7184199094772339, "sampling/importance_sampling_ratio/min": 0.00018671857833396643, "sampling/sampling_logp_difference/max": 2.2724802494049072, "sampling/sampling_logp_difference/mean": 0.17779794335365295, "step": 1007, "step_time": 10.165069262000543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 1.1916446201503277, "epoch": 0.01008, "grad_norm": 0.16061364114284515, "kl": 1.1903995350003242, "learning_rate": 9.999890708229887e-06, "loss": -0.0709, "step": 1008, "step_time": 5.252936355000202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 318.59375, "completions/mean_terminated_length": 318.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.7508646175265312, "epoch": 0.01009, "frac_reward_zero_std": 0.25, "grad_norm": 0.33480551838874817, "kl": 0.7970219366252422, "learning_rate": 9.999890483235117e-06, "loss": -0.0436, "num_tokens": 23647843.0, "reward": 2.3158187866210938, "reward_std": 2.3804328441619873, "rewards/rollout_reward_func/mean": 2.3158187866210938, "rewards/rollout_reward_func/std": 3.079951524734497, "sampling/importance_sampling_ratio/max": 0.9785471558570862, "sampling/importance_sampling_ratio/mean": 0.7919056415557861, "sampling/importance_sampling_ratio/min": 0.0270681232213974, "sampling/sampling_logp_difference/max": 1.814777135848999, "sampling/sampling_logp_difference/mean": 0.07960796356201172, "step": 1009, "step_time": 10.105079032999129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 0.729167265817523, "epoch": 0.0101, "grad_norm": 0.2617017924785614, "kl": 0.8005871446803212, "learning_rate": 9.999890258008995e-06, "loss": -0.0446, "step": 1010, "step_time": 5.21309568200013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1085.0, "completions/max_terminated_length": 1085.0, "completions/mean_length": 194.15625, "completions/mean_terminated_length": 194.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6756561025977135, "epoch": 0.01011, "frac_reward_zero_std": 0.5, "grad_norm": 1.0460118055343628, "kl": 2.2330455696210265, "learning_rate": 9.99989003255152e-06, "loss": -0.0001, "num_tokens": 23685396.0, "reward": 3.5132346153259277, "reward_std": 1.0255390405654907, "rewards/rollout_reward_func/mean": 3.5132346153259277, "rewards/rollout_reward_func/std": 1.9160940647125244, "sampling/importance_sampling_ratio/max": 0.9776198863983154, "sampling/importance_sampling_ratio/mean": 0.8052816390991211, "sampling/importance_sampling_ratio/min": 0.05678309127688408, "sampling/sampling_logp_difference/max": 2.0709152221679688, "sampling/sampling_logp_difference/mean": 0.08083131164312363, "step": 1011, "step_time": 10.17624080999849 }, { "clip_ratio/high_max": 0.02777777798473835, "clip_ratio/high_mean": 0.013888888992369175, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "entropy": 0.6644952110946178, "epoch": 0.01012, "grad_norm": 0.23584774136543274, "kl": 0.9705953933298588, "learning_rate": 9.999889806862695e-06, "loss": -0.0041, "step": 1012, "step_time": 5.526366492995294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 824.0, "completions/max_terminated_length": 824.0, "completions/mean_length": 87.71875, "completions/mean_terminated_length": 87.71875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.868091806769371, "epoch": 0.01013, "frac_reward_zero_std": 0.5, "grad_norm": 0.14845667779445648, "kl": 0.805386995896697, "learning_rate": 9.999889580942514e-06, "loss": -0.0278, "num_tokens": 23723041.0, "reward": -0.051798999309539795, "reward_std": 1.4910061359405518, "rewards/rollout_reward_func/mean": -0.051798999309539795, "rewards/rollout_reward_func/std": 3.1914896965026855, "sampling/importance_sampling_ratio/max": 1.0189601182937622, "sampling/importance_sampling_ratio/mean": 0.7927311658859253, "sampling/importance_sampling_ratio/min": 0.026444997638463974, "sampling/sampling_logp_difference/max": 1.9359772205352783, "sampling/sampling_logp_difference/mean": 0.1065930724143982, "step": 1013, "step_time": 8.888398814002358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8669917620718479, "epoch": 0.01014, "grad_norm": 0.12642918527126312, "kl": 0.8123425971716642, "learning_rate": 9.999889354790982e-06, "loss": -0.0281, "step": 1014, "step_time": 4.614050103999034 }, { "clip_ratio/high_max": 0.015165441203862429, "clip_ratio/high_mean": 0.007582720601931214, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.012791054090484977, "completions/clipped_ratio": 0.0, "completions/max_length": 1318.0, "completions/max_terminated_length": 1318.0, "completions/mean_length": 667.46875, "completions/mean_terminated_length": 667.46875, "completions/min_length": 365.0, "completions/min_terminated_length": 365.0, "entropy": 2.0311503559350967, "epoch": 0.01015, "frac_reward_zero_std": 0.0, "grad_norm": 0.21485547721385956, "kl": 1.1540638655424118, "learning_rate": 9.999889128408096e-06, "loss": -0.0423, "num_tokens": 23782910.0, "reward": -0.07350742816925049, "reward_std": 3.0800628662109375, "rewards/rollout_reward_func/mean": -0.07350742816925049, "rewards/rollout_reward_func/std": 3.2691657543182373, "sampling/importance_sampling_ratio/max": 1.0435786247253418, "sampling/importance_sampling_ratio/mean": 0.4201241731643677, "sampling/importance_sampling_ratio/min": 0.0028953857254236937, "sampling/sampling_logp_difference/max": 2.4113285541534424, "sampling/sampling_logp_difference/mean": 0.25337374210357666, "step": 1015, "step_time": 11.98162489599963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.014182692859321833, "clip_ratio/low_min": 0.008333333767950535, "clip_ratio/region_mean": 0.014182692859321833, "entropy": 2.0284203737974167, "epoch": 0.01016, "grad_norm": 0.2294924408197403, "kl": 1.1494074761867523, "learning_rate": 9.99988890179386e-06, "loss": -0.0425, "step": 1016, "step_time": 6.18197082700317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 333.71875, "completions/mean_terminated_length": 333.71875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4110875204205513, "epoch": 0.01017, "frac_reward_zero_std": 0.25, "grad_norm": 0.26768264174461365, "kl": 0.9329747129231691, "learning_rate": 9.999888674948271e-06, "loss": -0.0306, "num_tokens": 23827672.0, "reward": 1.15324068069458, "reward_std": 1.1064879894256592, "rewards/rollout_reward_func/mean": 1.15324068069458, "rewards/rollout_reward_func/std": 3.895979642868042, "sampling/importance_sampling_ratio/max": 0.9750638604164124, "sampling/importance_sampling_ratio/mean": 0.6512023210525513, "sampling/importance_sampling_ratio/min": 0.035913798958063126, "sampling/sampling_logp_difference/max": 1.5064884424209595, "sampling/sampling_logp_difference/mean": 0.1487424522638321, "step": 1017, "step_time": 12.152939238996623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.409194990992546, "epoch": 0.01018, "grad_norm": 0.3776502013206482, "kl": 0.9364800676703453, "learning_rate": 9.999888447871328e-06, "loss": -0.0313, "step": 1018, "step_time": 6.964846082000804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1143.0, "completions/max_terminated_length": 1143.0, "completions/mean_length": 286.0, "completions/mean_terminated_length": 286.0, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.1073353663086891, "epoch": 0.01019, "frac_reward_zero_std": 0.25, "grad_norm": 0.30629971623420715, "kl": 1.2662716656923294, "learning_rate": 9.999888220563034e-06, "loss": -0.0367, "num_tokens": 23872223.0, "reward": 1.8528902530670166, "reward_std": 2.629340648651123, "rewards/rollout_reward_func/mean": 1.8528902530670166, "rewards/rollout_reward_func/std": 3.4224321842193604, "sampling/importance_sampling_ratio/max": 0.9751163721084595, "sampling/importance_sampling_ratio/mean": 0.7009672522544861, "sampling/importance_sampling_ratio/min": 0.14247529208660126, "sampling/sampling_logp_difference/max": 1.389870285987854, "sampling/sampling_logp_difference/mean": 0.10838864743709564, "step": 1019, "step_time": 10.432121109000946 }, { "clip_ratio/high_max": 0.036706349812448025, "clip_ratio/high_mean": 0.018353174906224012, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018353174906224012, "entropy": 1.1055239737033844, "epoch": 0.0102, "grad_norm": 0.26006370782852173, "kl": 1.2485650982707739, "learning_rate": 9.999887993023387e-06, "loss": -0.0369, "step": 1020, "step_time": 5.716221028000291 }, { "clip_ratio/high_max": 0.02222222276031971, "clip_ratio/high_mean": 0.011111111380159855, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011111111380159855, "completions/clipped_ratio": 0.0, "completions/max_length": 1648.0, "completions/max_terminated_length": 1648.0, "completions/mean_length": 679.5625, "completions/mean_terminated_length": 679.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2571759596467018, "epoch": 0.01021, "frac_reward_zero_std": 0.0, "grad_norm": 0.2068272829055786, "kl": 0.9226078167557716, "learning_rate": 9.999887765252387e-06, "loss": -0.0633, "num_tokens": 23933188.0, "reward": 3.267409324645996, "reward_std": 2.8358142375946045, "rewards/rollout_reward_func/mean": 3.267409324645996, "rewards/rollout_reward_func/std": 3.217376708984375, "sampling/importance_sampling_ratio/max": 1.046520709991455, "sampling/importance_sampling_ratio/mean": 0.6387474536895752, "sampling/importance_sampling_ratio/min": 4.636431085396741e-19, "sampling/sampling_logp_difference/max": 15.564445495605469, "sampling/sampling_logp_difference/mean": 0.34694844484329224, "step": 1021, "step_time": 12.788747620001232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.007352941203862429, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007352941203862429, "entropy": 1.2699333764612675, "epoch": 0.01022, "grad_norm": 0.2676071524620056, "kl": 0.9196005500853062, "learning_rate": 9.999887537250035e-06, "loss": -0.0632, "step": 1022, "step_time": 7.498877786001685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1588.0, "completions/max_terminated_length": 1588.0, "completions/mean_length": 294.9375, "completions/mean_terminated_length": 294.9375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9529300183057785, "epoch": 0.01023, "frac_reward_zero_std": 0.25, "grad_norm": 0.24104556441307068, "kl": 0.8272740188986063, "learning_rate": 9.999887309016332e-06, "loss": -0.0215, "num_tokens": 23978889.0, "reward": 2.570833683013916, "reward_std": 2.040647029876709, "rewards/rollout_reward_func/mean": 2.570833683013916, "rewards/rollout_reward_func/std": 3.2393696308135986, "sampling/importance_sampling_ratio/max": 1.0602370500564575, "sampling/importance_sampling_ratio/mean": 0.7690160274505615, "sampling/importance_sampling_ratio/min": 0.08652611821889877, "sampling/sampling_logp_difference/max": 1.6557221412658691, "sampling/sampling_logp_difference/mean": 0.10950852930545807, "step": 1023, "step_time": 11.730284647001099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.963543489575386, "epoch": 0.01024, "grad_norm": 0.23709169030189514, "kl": 0.8244979735463858, "learning_rate": 9.999887080551273e-06, "loss": -0.022, "step": 1024, "step_time": 7.190428547002739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1865.0, "completions/max_terminated_length": 1865.0, "completions/mean_length": 745.0, "completions/mean_terminated_length": 745.0, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "entropy": 1.2982934638857841, "epoch": 0.01025, "frac_reward_zero_std": 0.0, "grad_norm": 0.42496052384376526, "kl": 1.1015103980898857, "learning_rate": 9.999886851854866e-06, "loss": -0.0875, "num_tokens": 24041026.0, "reward": 3.5555520057678223, "reward_std": 1.8617274761199951, "rewards/rollout_reward_func/mean": 3.5555520057678223, "rewards/rollout_reward_func/std": 2.8329715728759766, "sampling/importance_sampling_ratio/max": 0.9963878989219666, "sampling/importance_sampling_ratio/mean": 0.6215764284133911, "sampling/importance_sampling_ratio/min": 2.4910065329897257e-12, "sampling/sampling_logp_difference/max": 13.611894607543945, "sampling/sampling_logp_difference/mean": 0.2677571177482605, "step": 1025, "step_time": 12.932379691001188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "entropy": 1.3113617077469826, "epoch": 0.01026, "grad_norm": 0.4121761918067932, "kl": 1.1012832298874855, "learning_rate": 9.999886622927104e-06, "loss": -0.0881, "step": 1026, "step_time": 7.709383578003326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1604.0, "completions/max_terminated_length": 1604.0, "completions/mean_length": 477.09375, "completions/mean_terminated_length": 477.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2899527177214622, "epoch": 0.01027, "frac_reward_zero_std": 0.25, "grad_norm": 0.2188386768102646, "kl": 0.9043027758598328, "learning_rate": 9.99988639376799e-06, "loss": -0.0484, "num_tokens": 24091155.0, "reward": 2.980287551879883, "reward_std": 2.3823976516723633, "rewards/rollout_reward_func/mean": 2.980287551879883, "rewards/rollout_reward_func/std": 2.821322441101074, "sampling/importance_sampling_ratio/max": 1.063557505607605, "sampling/importance_sampling_ratio/mean": 0.68475741147995, "sampling/importance_sampling_ratio/min": 0.0035591425839811563, "sampling/sampling_logp_difference/max": 2.4643094539642334, "sampling/sampling_logp_difference/mean": 0.16993433237075806, "step": 1027, "step_time": 11.923289544 }, { "clip_ratio/high_max": 0.011363636702299118, "clip_ratio/high_mean": 0.005681818351149559, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 1.2664064019918442, "epoch": 0.01028, "grad_norm": 0.20970459282398224, "kl": 0.9011700041592121, "learning_rate": 9.999886164377523e-06, "loss": -0.049, "step": 1028, "step_time": 7.194670345999839 }, { "clip_ratio/high_max": 0.0052083334885537624, "clip_ratio/high_mean": 0.0026041667442768812, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008854166837409139, "completions/clipped_ratio": 0.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 344.3125, "completions/mean_terminated_length": 344.3125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.219521850347519, "epoch": 0.01029, "frac_reward_zero_std": 0.0, "grad_norm": 0.3874674141407013, "kl": 0.875544648617506, "learning_rate": 9.999885934755706e-06, "loss": -0.0406, "num_tokens": 24138700.0, "reward": 1.749879240989685, "reward_std": 2.5579700469970703, "rewards/rollout_reward_func/mean": 1.749879240989685, "rewards/rollout_reward_func/std": 3.1580471992492676, "sampling/importance_sampling_ratio/max": 0.9776600003242493, "sampling/importance_sampling_ratio/mean": 0.7038885354995728, "sampling/importance_sampling_ratio/min": 1.2208433934105878e-17, "sampling/sampling_logp_difference/max": 15.565013885498047, "sampling/sampling_logp_difference/mean": 0.3469289541244507, "step": 1029, "step_time": 9.662659318000806 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 1.206022646278143, "epoch": 0.0103, "grad_norm": 0.31605684757232666, "kl": 0.8528986442834139, "learning_rate": 9.999885704902536e-06, "loss": -0.0419, "step": 1030, "step_time": 5.7558690259957075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.0, "completions/max_length": 843.0, "completions/max_terminated_length": 843.0, "completions/mean_length": 434.21875, "completions/mean_terminated_length": 434.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8167150132358074, "epoch": 0.01031, "frac_reward_zero_std": 0.0, "grad_norm": 0.22558864951133728, "kl": 0.7900378946214914, "learning_rate": 9.999885474818013e-06, "loss": -0.0647, "num_tokens": 24191873.0, "reward": 3.0339584350585938, "reward_std": 2.92985463142395, "rewards/rollout_reward_func/mean": 3.0339584350585938, "rewards/rollout_reward_func/std": 3.0004570484161377, "sampling/importance_sampling_ratio/max": 0.9737061262130737, "sampling/importance_sampling_ratio/mean": 0.770960807800293, "sampling/importance_sampling_ratio/min": 3.691224572093671e-14, "sampling/sampling_logp_difference/max": 13.41136360168457, "sampling/sampling_logp_difference/mean": 0.25026795268058777, "step": 1031, "step_time": 9.073410536997471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008854166837409139, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008854166837409139, "entropy": 0.8229270670562983, "epoch": 0.01032, "grad_norm": 0.1478547751903534, "kl": 0.7909552790224552, "learning_rate": 9.999885244502138e-06, "loss": -0.0647, "step": 1032, "step_time": 5.476290307000454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 184.1875, "completions/mean_terminated_length": 184.1875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7151280120015144, "epoch": 0.01033, "frac_reward_zero_std": 0.25, "grad_norm": 0.16740655899047852, "kl": 0.9157835533842444, "learning_rate": 9.999885013954911e-06, "loss": 0.0182, "num_tokens": 24233035.0, "reward": 2.709960460662842, "reward_std": 0.23257064819335938, "rewards/rollout_reward_func/mean": 2.709960460662842, "rewards/rollout_reward_func/std": 3.1489720344543457, "sampling/importance_sampling_ratio/max": 1.0478925704956055, "sampling/importance_sampling_ratio/mean": 0.8362979888916016, "sampling/importance_sampling_ratio/min": 0.024777254089713097, "sampling/sampling_logp_difference/max": 2.2521049976348877, "sampling/sampling_logp_difference/mean": 0.08214061707258224, "step": 1033, "step_time": 7.767564765998031 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.7223320212215185, "epoch": 0.01034, "grad_norm": 0.27918246388435364, "kl": 0.9207731885835528, "learning_rate": 9.99988478317633e-06, "loss": 0.018, "step": 1034, "step_time": 4.12305995099814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008801020449027419, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008801020449027419, "completions/clipped_ratio": 0.0625, "completions/max_length": 1583.0, "completions/max_terminated_length": 1175.0, "completions/mean_length": 463.21875, "completions/mean_terminated_length": 427.13336181640625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.484673857688904, "epoch": 0.01035, "frac_reward_zero_std": 0.0, "grad_norm": 0.19268903136253357, "kl": 1.3274744376540184, "learning_rate": 9.999884552166398e-06, "loss": -0.0423, "num_tokens": 24285822.0, "reward": -0.5836915969848633, "reward_std": 2.509092330932617, "rewards/rollout_reward_func/mean": -0.5836915969848633, "rewards/rollout_reward_func/std": 3.2464873790740967, "sampling/importance_sampling_ratio/max": 0.9520483613014221, "sampling/importance_sampling_ratio/mean": 0.3294222354888916, "sampling/importance_sampling_ratio/min": 6.258057704455699e-30, "sampling/sampling_logp_difference/max": 12.08073902130127, "sampling/sampling_logp_difference/mean": 0.5718346834182739, "step": 1035, "step_time": 12.35642976699637 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.013366284198127687, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.021178784198127687, "entropy": 2.4639419317245483, "epoch": 0.01036, "grad_norm": 0.17412680387496948, "kl": 1.43195129185915, "learning_rate": 9.999884320925112e-06, "loss": -0.0421, "step": 1036, "step_time": 6.717434157999378 }, { "clip_ratio/high_max": 0.018716577906161547, "clip_ratio/high_mean": 0.009358288953080773, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009358288953080773, "completions/clipped_ratio": 0.0, "completions/max_length": 1266.0, "completions/max_terminated_length": 1266.0, "completions/mean_length": 380.5625, "completions/mean_terminated_length": 380.5625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4362283572554588, "epoch": 0.01037, "frac_reward_zero_std": 0.25, "grad_norm": 0.27200189232826233, "kl": 1.326272850856185, "learning_rate": 9.999884089452478e-06, "loss": -0.0481, "num_tokens": 24333168.0, "reward": 2.449686050415039, "reward_std": 2.727585792541504, "rewards/rollout_reward_func/mean": 2.449686050415039, "rewards/rollout_reward_func/std": 3.175535202026367, "sampling/importance_sampling_ratio/max": 0.9757931232452393, "sampling/importance_sampling_ratio/mean": 0.66913902759552, "sampling/importance_sampling_ratio/min": 9.806228745690149e-12, "sampling/sampling_logp_difference/max": 9.178159713745117, "sampling/sampling_logp_difference/mean": 0.33109551668167114, "step": 1037, "step_time": 11.091796473994691 }, { "clip_ratio/high_max": 0.014705882407724857, "clip_ratio/high_mean": 0.007352941203862429, "clip_ratio/low_mean": 0.011363636702299118, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018716577906161547, "entropy": 1.464474730193615, "epoch": 0.01038, "grad_norm": 0.19479520618915558, "kl": 1.3493984397500753, "learning_rate": 9.999883857748489e-06, "loss": -0.0489, "step": 1038, "step_time": 5.912744421997559 }, { "clip_ratio/high_max": 0.02638888917863369, "clip_ratio/high_mean": 0.013194444589316845, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.019444444682449102, "completions/clipped_ratio": 0.0, "completions/max_length": 1205.0, "completions/max_terminated_length": 1205.0, "completions/mean_length": 398.625, "completions/mean_terminated_length": 398.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.0966560058295727, "epoch": 0.01039, "frac_reward_zero_std": 0.0, "grad_norm": 0.4572112560272217, "kl": 1.2852685451507568, "learning_rate": 9.99988362581315e-06, "loss": -0.0283, "num_tokens": 24383017.0, "reward": -0.6460349559783936, "reward_std": 2.562570810317993, "rewards/rollout_reward_func/mean": -0.6460349559783936, "rewards/rollout_reward_func/std": 3.0921618938446045, "sampling/importance_sampling_ratio/max": 1.0871409177780151, "sampling/importance_sampling_ratio/mean": 0.7622125148773193, "sampling/importance_sampling_ratio/min": 0.000958822260145098, "sampling/sampling_logp_difference/max": 2.143747091293335, "sampling/sampling_logp_difference/mean": 0.14508353173732758, "step": 1039, "step_time": 11.145130868004344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.02500000037252903, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.02500000037252903, "entropy": 1.0874073952436447, "epoch": 0.0104, "grad_norm": 0.48356518149375916, "kl": 1.3131954595446587, "learning_rate": 9.999883393646456e-06, "loss": -0.0306, "step": 1040, "step_time": 5.810489042998597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0026041667442768812, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0026041667442768812, "completions/clipped_ratio": 0.0, "completions/max_length": 1021.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 284.15625, "completions/mean_terminated_length": 284.15625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.5579002052545547, "epoch": 0.01041, "frac_reward_zero_std": 0.25, "grad_norm": 0.22243383526802063, "kl": 0.5724125020205975, "learning_rate": 9.999883161248411e-06, "loss": -0.0366, "num_tokens": 24428337.0, "reward": 2.5934014320373535, "reward_std": 0.29875418543815613, "rewards/rollout_reward_func/mean": 2.5934014320373535, "rewards/rollout_reward_func/std": 3.1234991550445557, "sampling/importance_sampling_ratio/max": 0.9755913019180298, "sampling/importance_sampling_ratio/mean": 0.8539390563964844, "sampling/importance_sampling_ratio/min": 0.009622056037187576, "sampling/sampling_logp_difference/max": 1.5138144493103027, "sampling/sampling_logp_difference/mean": 0.0619904100894928, "step": 1041, "step_time": 10.005912541002544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.552031522616744, "epoch": 0.01042, "grad_norm": 0.22654804587364197, "kl": 0.5761129595339298, "learning_rate": 9.999882928619013e-06, "loss": -0.037, "step": 1042, "step_time": 5.345685962000061 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "completions/clipped_ratio": 0.0, "completions/max_length": 1177.0, "completions/max_terminated_length": 1177.0, "completions/mean_length": 425.375, "completions/mean_terminated_length": 425.375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8389484882354736, "epoch": 0.01043, "frac_reward_zero_std": 0.0, "grad_norm": 0.4592910408973694, "kl": 1.0373447462916374, "learning_rate": 9.999882695758263e-06, "loss": -0.0401, "num_tokens": 24480763.0, "reward": 1.7834044694900513, "reward_std": 3.407888174057007, "rewards/rollout_reward_func/mean": 1.7834044694900513, "rewards/rollout_reward_func/std": 3.520643472671509, "sampling/importance_sampling_ratio/max": 1.00213623046875, "sampling/importance_sampling_ratio/mean": 0.7423892021179199, "sampling/importance_sampling_ratio/min": 0.012074703350663185, "sampling/sampling_logp_difference/max": 2.055992841720581, "sampling/sampling_logp_difference/mean": 0.11724179983139038, "step": 1043, "step_time": 11.142027798001436 }, { "clip_ratio/high_max": 0.02003205195069313, "clip_ratio/high_mean": 0.010016025975346565, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010016025975346565, "entropy": 0.8632461614906788, "epoch": 0.01044, "grad_norm": 0.5315345525741577, "kl": 1.0364739745855331, "learning_rate": 9.999882462666162e-06, "loss": -0.0425, "step": 1044, "step_time": 5.753233724000893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1376.0, "completions/max_terminated_length": 1376.0, "completions/mean_length": 293.34375, "completions/mean_terminated_length": 293.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4175494350492954, "epoch": 0.01045, "frac_reward_zero_std": 0.25, "grad_norm": 0.42524170875549316, "kl": 1.2399295195937157, "learning_rate": 9.999882229342708e-06, "loss": -0.0445, "num_tokens": 24523987.0, "reward": 1.2681267261505127, "reward_std": 2.5586097240448, "rewards/rollout_reward_func/mean": 1.2681267261505127, "rewards/rollout_reward_func/std": 3.568194627761841, "sampling/importance_sampling_ratio/max": 1.0943660736083984, "sampling/importance_sampling_ratio/mean": 0.6711118817329407, "sampling/importance_sampling_ratio/min": 0.0007623870624229312, "sampling/sampling_logp_difference/max": 2.5233726501464844, "sampling/sampling_logp_difference/mean": 0.1899225413799286, "step": 1045, "step_time": 11.301179480999053 }, { "clip_ratio/high_max": 0.03428030386567116, "clip_ratio/high_mean": 0.01714015193283558, "clip_ratio/low_mean": 0.015224359463900328, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.03236451139673591, "entropy": 1.433868981897831, "epoch": 0.01046, "grad_norm": 0.15482330322265625, "kl": 1.253632701933384, "learning_rate": 9.999881995787903e-06, "loss": -0.0456, "step": 1046, "step_time": 6.0897364519987605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1667.0, "completions/max_terminated_length": 1667.0, "completions/mean_length": 330.21875, "completions/mean_terminated_length": 330.21875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9586048014461994, "epoch": 0.01047, "frac_reward_zero_std": 0.25, "grad_norm": 0.4337850511074066, "kl": 0.9562774924561381, "learning_rate": 9.999881762001746e-06, "loss": -0.0404, "num_tokens": 24570609.0, "reward": 2.0625863075256348, "reward_std": 1.8557686805725098, "rewards/rollout_reward_func/mean": 2.0625863075256348, "rewards/rollout_reward_func/std": 3.46465802192688, "sampling/importance_sampling_ratio/max": 1.4265344142913818, "sampling/importance_sampling_ratio/mean": 0.7813040018081665, "sampling/importance_sampling_ratio/min": 0.007003336679190397, "sampling/sampling_logp_difference/max": 1.8608570098876953, "sampling/sampling_logp_difference/mean": 0.12385775148868561, "step": 1047, "step_time": 11.471976578999602 }, { "clip_ratio/high_max": 0.00657894741743803, "clip_ratio/high_mean": 0.003289473708719015, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.003289473708719015, "entropy": 0.9531217478215694, "epoch": 0.01048, "grad_norm": 0.35198062658309937, "kl": 0.9522557007148862, "learning_rate": 9.999881527984237e-06, "loss": -0.0422, "step": 1048, "step_time": 7.2455882520025625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 151.28125, "completions/mean_terminated_length": 151.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9476274196058512, "epoch": 0.01049, "frac_reward_zero_std": 0.5, "grad_norm": 0.28592613339424133, "kl": 0.6299678329378366, "learning_rate": 9.999881293735373e-06, "loss": -0.0199, "num_tokens": 24608343.0, "reward": 3.31489896774292, "reward_std": 1.4752442836761475, "rewards/rollout_reward_func/mean": 3.31489896774292, "rewards/rollout_reward_func/std": 2.1905124187469482, "sampling/importance_sampling_ratio/max": 0.999194324016571, "sampling/importance_sampling_ratio/mean": 0.7656769752502441, "sampling/importance_sampling_ratio/min": 0.00659216521307826, "sampling/sampling_logp_difference/max": 2.0680811405181885, "sampling/sampling_logp_difference/mean": 0.10189880430698395, "step": 1049, "step_time": 10.146646552000675 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.0062500000931322575, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014062500093132257, "entropy": 0.9262179620563984, "epoch": 0.0105, "grad_norm": 0.21421322226524353, "kl": 0.6375158429145813, "learning_rate": 9.99988105925516e-06, "loss": -0.0204, "step": 1050, "step_time": 6.262001913999484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013888888992369175, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013888888992369175, "completions/clipped_ratio": 0.0, "completions/max_length": 473.0, "completions/max_terminated_length": 473.0, "completions/mean_length": 84.28125, "completions/mean_terminated_length": 84.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.289922147989273, "epoch": 0.01051, "frac_reward_zero_std": 0.5, "grad_norm": 0.06159602850675583, "kl": 0.702265917789191, "learning_rate": 9.999880824543595e-06, "loss": -0.0346, "num_tokens": 24643448.0, "reward": 1.6580784320831299, "reward_std": 1.0811247825622559, "rewards/rollout_reward_func/mean": 1.6580784320831299, "rewards/rollout_reward_func/std": 3.51531720161438, "sampling/importance_sampling_ratio/max": 0.9739121198654175, "sampling/importance_sampling_ratio/mean": 0.7230418920516968, "sampling/importance_sampling_ratio/min": 0.00844400841742754, "sampling/sampling_logp_difference/max": 2.0794525146484375, "sampling/sampling_logp_difference/mean": 0.17468178272247314, "step": 1051, "step_time": 7.0346737259988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.268346980214119, "epoch": 0.01052, "grad_norm": 0.059036508202552795, "kl": 0.7143791019916534, "learning_rate": 9.999880589600677e-06, "loss": -0.0345, "step": 1052, "step_time": 4.582035724000889 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011979166883975267, "completions/clipped_ratio": 0.0, "completions/max_length": 1016.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 284.15625, "completions/mean_terminated_length": 284.15625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.1415796540677547, "epoch": 0.01053, "frac_reward_zero_std": 0.5, "grad_norm": 0.2900788187980652, "kl": 0.8491765186190605, "learning_rate": 9.999880354426408e-06, "loss": -0.0419, "num_tokens": 24686636.0, "reward": 2.544476270675659, "reward_std": 2.0058321952819824, "rewards/rollout_reward_func/mean": 2.544476270675659, "rewards/rollout_reward_func/std": 3.0841197967529297, "sampling/importance_sampling_ratio/max": 0.9765169620513916, "sampling/importance_sampling_ratio/mean": 0.7146575450897217, "sampling/importance_sampling_ratio/min": 0.004709345754235983, "sampling/sampling_logp_difference/max": 1.8924354314804077, "sampling/sampling_logp_difference/mean": 0.16849428415298462, "step": 1053, "step_time": 9.526362419002908 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.0078125, "clip_ratio/low_mean": 0.008333333767950535, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016145833767950535, "entropy": 1.1206730119884014, "epoch": 0.01054, "grad_norm": 0.29071399569511414, "kl": 0.8429350880905986, "learning_rate": 9.999880119020786e-06, "loss": -0.0431, "step": 1054, "step_time": 5.77754700100013 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "completions/clipped_ratio": 0.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 288.34375, "completions/mean_terminated_length": 288.34375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.6467370428144932, "epoch": 0.01055, "frac_reward_zero_std": 0.5, "grad_norm": 0.30889302492141724, "kl": 0.6754206158220768, "learning_rate": 9.999879883383812e-06, "loss": -0.0285, "num_tokens": 24731230.0, "reward": 2.903134822845459, "reward_std": 1.7930394411087036, "rewards/rollout_reward_func/mean": 2.903134822845459, "rewards/rollout_reward_func/std": 2.6857290267944336, "sampling/importance_sampling_ratio/max": 1.0312460660934448, "sampling/importance_sampling_ratio/mean": 0.8224341869354248, "sampling/importance_sampling_ratio/min": 0.07807759195566177, "sampling/sampling_logp_difference/max": 1.9683910608291626, "sampling/sampling_logp_difference/mean": 0.0637407898902893, "step": 1055, "step_time": 10.605783512000926 }, { "clip_ratio/high_max": 0.009615384973585606, "clip_ratio/high_mean": 0.004807692486792803, "clip_ratio/low_mean": 0.008928571827709675, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013736264314502478, "entropy": 0.6369641236960888, "epoch": 0.01056, "grad_norm": 0.23091213405132294, "kl": 0.6768921762704849, "learning_rate": 9.999879647515487e-06, "loss": -0.0291, "step": 1056, "step_time": 6.211502075997487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 151.59375, "completions/mean_terminated_length": 151.59375, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9296255707740784, "epoch": 0.01057, "frac_reward_zero_std": 0.25, "grad_norm": 0.15702438354492188, "kl": 0.881830932572484, "learning_rate": 9.99987941141581e-06, "loss": -0.0678, "num_tokens": 24771419.0, "reward": 1.2717795372009277, "reward_std": 3.1064162254333496, "rewards/rollout_reward_func/mean": 1.2717795372009277, "rewards/rollout_reward_func/std": 3.7909634113311768, "sampling/importance_sampling_ratio/max": 0.9778826832771301, "sampling/importance_sampling_ratio/mean": 0.791786253452301, "sampling/importance_sampling_ratio/min": 0.012164168059825897, "sampling/sampling_logp_difference/max": 2.4565932750701904, "sampling/sampling_logp_difference/mean": 0.13880035281181335, "step": 1057, "step_time": 8.20396552099919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9246207606047392, "epoch": 0.01058, "grad_norm": 0.16121646761894226, "kl": 0.844249208457768, "learning_rate": 9.99987917508478e-06, "loss": -0.0677, "step": 1058, "step_time": 4.869609972000035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1798.0, "completions/max_terminated_length": 1798.0, "completions/mean_length": 251.96875, "completions/mean_terminated_length": 251.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.45601678639650345, "epoch": 0.01059, "frac_reward_zero_std": 0.25, "grad_norm": 0.24805283546447754, "kl": 0.8385537061840296, "learning_rate": 9.9998789385224e-06, "loss": -0.0188, "num_tokens": 24814226.0, "reward": 3.7750515937805176, "reward_std": 1.2234163284301758, "rewards/rollout_reward_func/mean": 3.7750515937805176, "rewards/rollout_reward_func/std": 1.9720282554626465, "sampling/importance_sampling_ratio/max": 0.9763563275337219, "sampling/importance_sampling_ratio/mean": 0.8866156339645386, "sampling/importance_sampling_ratio/min": 0.09810473769903183, "sampling/sampling_logp_difference/max": 0.7928142547607422, "sampling/sampling_logp_difference/mean": 0.038742464035749435, "step": 1059, "step_time": 12.045122618004825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.4509176928550005, "epoch": 0.0106, "grad_norm": 0.22798573970794678, "kl": 0.8673081351444125, "learning_rate": 9.999878701728665e-06, "loss": -0.0192, "step": 1060, "step_time": 7.211371843000961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1281.0, "completions/max_terminated_length": 1281.0, "completions/mean_length": 388.90625, "completions/mean_terminated_length": 388.90625, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.042051613330841, "epoch": 0.01061, "frac_reward_zero_std": 0.0, "grad_norm": 0.30432626605033875, "kl": 1.283546794205904, "learning_rate": 9.999878464703582e-06, "loss": -0.0387, "num_tokens": 24864998.0, "reward": -0.02222844958305359, "reward_std": 2.308894634246826, "rewards/rollout_reward_func/mean": -0.02222844958305359, "rewards/rollout_reward_func/std": 3.1494383811950684, "sampling/importance_sampling_ratio/max": 1.0052210092544556, "sampling/importance_sampling_ratio/mean": 0.731398344039917, "sampling/importance_sampling_ratio/min": 0.010547916404902935, "sampling/sampling_logp_difference/max": 1.9335572719573975, "sampling/sampling_logp_difference/mean": 0.126013845205307, "step": 1061, "step_time": 11.059742010000264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.056759025901556, "epoch": 0.01062, "grad_norm": 0.2818869948387146, "kl": 1.2917816825211048, "learning_rate": 9.999878227447144e-06, "loss": -0.0393, "step": 1062, "step_time": 5.982023150996611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0021551724057644606, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0021551724057644606, "completions/clipped_ratio": 0.0, "completions/max_length": 2207.0, "completions/max_terminated_length": 2207.0, "completions/mean_length": 624.90625, "completions/mean_terminated_length": 624.90625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8701673895120621, "epoch": 0.01063, "frac_reward_zero_std": 0.0, "grad_norm": 0.2584298849105835, "kl": 0.8185978382825851, "learning_rate": 9.999877989959355e-06, "loss": -0.0626, "num_tokens": 24922136.0, "reward": 1.7852084636688232, "reward_std": 3.158923864364624, "rewards/rollout_reward_func/mean": 1.7852084636688232, "rewards/rollout_reward_func/std": 3.6033947467803955, "sampling/importance_sampling_ratio/max": 0.9761104583740234, "sampling/importance_sampling_ratio/mean": 0.6949251294136047, "sampling/importance_sampling_ratio/min": 6.825129438157095e-17, "sampling/sampling_logp_difference/max": 17.094655990600586, "sampling/sampling_logp_difference/mean": 0.3120982348918915, "step": 1063, "step_time": 15.239915228001337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8960599470883608, "epoch": 0.01064, "grad_norm": 0.24614304304122925, "kl": 0.814337033778429, "learning_rate": 9.999877752240214e-06, "loss": -0.0633, "step": 1064, "step_time": 8.384655386002123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 414.125, "completions/mean_terminated_length": 414.125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.8514130879193544, "epoch": 0.01065, "frac_reward_zero_std": 0.0, "grad_norm": 0.09087374061346054, "kl": 0.7092932444065809, "learning_rate": 9.999877514289723e-06, "loss": -0.0702, "num_tokens": 24973638.0, "reward": 3.250063180923462, "reward_std": 2.259063959121704, "rewards/rollout_reward_func/mean": 3.250063180923462, "rewards/rollout_reward_func/std": 2.7767932415008545, "sampling/importance_sampling_ratio/max": 0.9907817244529724, "sampling/importance_sampling_ratio/mean": 0.7723124027252197, "sampling/importance_sampling_ratio/min": 0.011370335705578327, "sampling/sampling_logp_difference/max": 1.8126689195632935, "sampling/sampling_logp_difference/mean": 0.11728214472532272, "step": 1065, "step_time": 10.465860888998577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.8609724529087543, "epoch": 0.01066, "grad_norm": 0.09524855017662048, "kl": 0.71302430331707, "learning_rate": 9.999877276107878e-06, "loss": -0.0703, "step": 1066, "step_time": 5.814804788997208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 372.0, "completions/max_terminated_length": 372.0, "completions/mean_length": 89.375, "completions/mean_terminated_length": 89.375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.17917237803339958, "epoch": 0.01067, "frac_reward_zero_std": 0.75, "grad_norm": 0.006979926023632288, "kl": 0.6122120497748256, "learning_rate": 9.999877037694683e-06, "loss": 0.0014, "num_tokens": 25009180.0, "reward": 4.1990647315979, "reward_std": 0.009317704476416111, "rewards/rollout_reward_func/mean": 4.1990647315979, "rewards/rollout_reward_func/std": 0.3509596586227417, "sampling/importance_sampling_ratio/max": 0.9857180118560791, "sampling/importance_sampling_ratio/mean": 0.9645295143127441, "sampling/importance_sampling_ratio/min": 0.9335914254188538, "sampling/sampling_logp_difference/max": 0.04571753740310669, "sampling/sampling_logp_difference/mean": 0.010641360655426979, "step": 1067, "step_time": 7.739893464002307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.18071816861629486, "epoch": 0.01068, "grad_norm": 0.005383663810789585, "kl": 0.6121987025253475, "learning_rate": 9.999876799050135e-06, "loss": 0.0014, "step": 1068, "step_time": 3.9578678580019186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1536.0, "completions/max_terminated_length": 1536.0, "completions/mean_length": 619.9375, "completions/mean_terminated_length": 619.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9674080796539783, "epoch": 0.01069, "frac_reward_zero_std": 0.25, "grad_norm": 0.23518958687782288, "kl": 1.05471595749259, "learning_rate": 9.999876560174237e-06, "loss": -0.0357, "num_tokens": 25066927.0, "reward": 2.283459186553955, "reward_std": 2.4819514751434326, "rewards/rollout_reward_func/mean": 2.283459186553955, "rewards/rollout_reward_func/std": 3.1659576892852783, "sampling/importance_sampling_ratio/max": 0.9770911931991577, "sampling/importance_sampling_ratio/mean": 0.6726330518722534, "sampling/importance_sampling_ratio/min": 0.014115001074969769, "sampling/sampling_logp_difference/max": 2.5573036670684814, "sampling/sampling_logp_difference/mean": 0.1261601746082306, "step": 1069, "step_time": 12.60125443799916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 0.9843106605112553, "epoch": 0.0107, "grad_norm": 0.21961642801761627, "kl": 1.0339947007596493, "learning_rate": 9.999876321066986e-06, "loss": -0.0358, "step": 1070, "step_time": 6.639171519002048 }, { "clip_ratio/high_max": 0.010869565419852734, "clip_ratio/high_mean": 0.005434782709926367, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01064311619848013, "completions/clipped_ratio": 0.0, "completions/max_length": 1293.0, "completions/max_terminated_length": 1293.0, "completions/mean_length": 458.5, "completions/mean_terminated_length": 458.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.2761946953833103, "epoch": 0.01071, "frac_reward_zero_std": 0.0, "grad_norm": 0.48311668634414673, "kl": 1.4492971673607826, "learning_rate": 9.999876081728383e-06, "loss": -0.0616, "num_tokens": 25119978.0, "reward": 2.74078106880188, "reward_std": 3.2862424850463867, "rewards/rollout_reward_func/mean": 2.74078106880188, "rewards/rollout_reward_func/std": 3.21321439743042, "sampling/importance_sampling_ratio/max": 1.003676414489746, "sampling/importance_sampling_ratio/mean": 0.6378576755523682, "sampling/importance_sampling_ratio/min": 1.8568064414407426e-23, "sampling/sampling_logp_difference/max": 20.338211059570312, "sampling/sampling_logp_difference/mean": 0.4250069260597229, "step": 1071, "step_time": 11.465755472998353 }, { "clip_ratio/high_max": 0.023863636888563633, "clip_ratio/high_mean": 0.011931818444281816, "clip_ratio/low_mean": 0.01145833358168602, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.023390152025967836, "entropy": 1.2676998600363731, "epoch": 0.01072, "grad_norm": 0.2466934472322464, "kl": 1.4486276060342789, "learning_rate": 9.999875842158429e-06, "loss": -0.0635, "step": 1072, "step_time": 6.130033733001255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 442.28125, "completions/mean_terminated_length": 442.28125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9678929820656776, "epoch": 0.01073, "frac_reward_zero_std": 0.25, "grad_norm": 0.23519772291183472, "kl": 1.0561959967017174, "learning_rate": 9.999875602357123e-06, "loss": -0.0471, "num_tokens": 25169176.0, "reward": 2.6777961254119873, "reward_std": 1.7397934198379517, "rewards/rollout_reward_func/mean": 2.6777961254119873, "rewards/rollout_reward_func/std": 2.8722140789031982, "sampling/importance_sampling_ratio/max": 0.9746662378311157, "sampling/importance_sampling_ratio/mean": 0.7080720663070679, "sampling/importance_sampling_ratio/min": 5.11838398982369e-16, "sampling/sampling_logp_difference/max": 15.124319076538086, "sampling/sampling_logp_difference/mean": 0.30193832516670227, "step": 1073, "step_time": 11.232520270999885 }, { "clip_ratio/high_max": 0.004464285913854837, "clip_ratio/high_mean": 0.0022321429569274187, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.007913961308076978, "entropy": 0.9806153997778893, "epoch": 0.01074, "grad_norm": 0.2420676201581955, "kl": 1.0776590779423714, "learning_rate": 9.999875362324464e-06, "loss": -0.0474, "step": 1074, "step_time": 5.9671053839992965 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "completions/clipped_ratio": 0.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 653.46875, "completions/mean_terminated_length": 653.46875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.2035584039986134, "epoch": 0.01075, "frac_reward_zero_std": 0.0, "grad_norm": 0.37771329283714294, "kl": 1.4321660250425339, "learning_rate": 9.999875122060455e-06, "loss": -0.0467, "num_tokens": 25228824.0, "reward": -0.04661870002746582, "reward_std": 1.988628625869751, "rewards/rollout_reward_func/mean": -0.04661870002746582, "rewards/rollout_reward_func/std": 3.253267765045166, "sampling/importance_sampling_ratio/max": 0.9738749861717224, "sampling/importance_sampling_ratio/mean": 0.6536439657211304, "sampling/importance_sampling_ratio/min": 0.005202142987400293, "sampling/sampling_logp_difference/max": 2.537452220916748, "sampling/sampling_logp_difference/mean": 0.1560591459274292, "step": 1075, "step_time": 11.046929257003285 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0062500000931322575, "entropy": 1.2271618582308292, "epoch": 0.01076, "grad_norm": 0.3656533360481262, "kl": 1.4440756663680077, "learning_rate": 9.999874881565094e-06, "loss": -0.0481, "step": 1076, "step_time": 6.642859501002022 }, { "clip_ratio/high_max": 0.0062500000931322575, "clip_ratio/high_mean": 0.0031250000465661287, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0031250000465661287, "completions/clipped_ratio": 0.0, "completions/max_length": 1014.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 260.53125, "completions/mean_terminated_length": 260.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.6659094914793968, "epoch": 0.01077, "frac_reward_zero_std": 0.25, "grad_norm": 0.5293510556221008, "kl": 0.8358374387025833, "learning_rate": 9.999874640838381e-06, "loss": -0.0015, "num_tokens": 25272372.0, "reward": 2.912506103515625, "reward_std": 1.7118818759918213, "rewards/rollout_reward_func/mean": 2.912506103515625, "rewards/rollout_reward_func/std": 2.7508699893951416, "sampling/importance_sampling_ratio/max": 0.9777252674102783, "sampling/importance_sampling_ratio/mean": 0.8257267475128174, "sampling/importance_sampling_ratio/min": 0.0010591759346425533, "sampling/sampling_logp_difference/max": 1.6430673599243164, "sampling/sampling_logp_difference/mean": 0.09120073169469833, "step": 1077, "step_time": 9.75107744700108 }, { "clip_ratio/high_max": 0.015625, "clip_ratio/high_mean": 0.01406249962747097, "clip_ratio/low_mean": 0.0546875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.06874999962747097, "entropy": 0.8278390914201736, "epoch": 0.01078, "grad_norm": 0.19915568828582764, "kl": 0.8573692012578249, "learning_rate": 9.999874399880319e-06, "loss": -0.0031, "step": 1078, "step_time": 5.238299864997316 }, { "clip_ratio/high_max": 0.03125, "clip_ratio/high_mean": 0.015625, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.020432692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 946.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 307.6875, "completions/mean_terminated_length": 307.6875, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.262627735733986, "epoch": 0.01079, "frac_reward_zero_std": 0.25, "grad_norm": 0.2610866129398346, "kl": 1.480452612042427, "learning_rate": 9.999874158690902e-06, "loss": -0.0459, "num_tokens": 25318114.0, "reward": 2.695936918258667, "reward_std": 2.621896743774414, "rewards/rollout_reward_func/mean": 2.695936918258667, "rewards/rollout_reward_func/std": 3.238097667694092, "sampling/importance_sampling_ratio/max": 0.9762461185455322, "sampling/importance_sampling_ratio/mean": 0.7081501483917236, "sampling/importance_sampling_ratio/min": 0.008211780339479446, "sampling/sampling_logp_difference/max": 2.71400785446167, "sampling/sampling_logp_difference/mean": 0.16934503614902496, "step": 1079, "step_time": 10.0087287189981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010489510837942362, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010489510837942362, "entropy": 1.2644641771912575, "epoch": 0.0108, "grad_norm": 0.2565755844116211, "kl": 1.475633643567562, "learning_rate": 9.999873917270134e-06, "loss": -0.0474, "step": 1080, "step_time": 5.821556737000719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 350.0625, "completions/mean_terminated_length": 360.83868408203125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.9269855841994286, "epoch": 0.01081, "frac_reward_zero_std": 0.0, "grad_norm": 0.3900070786476135, "kl": 1.5635296702384949, "learning_rate": 9.999873675618016e-06, "loss": -0.0718, "num_tokens": 25366464.0, "reward": -0.03221738338470459, "reward_std": 3.2869389057159424, "rewards/rollout_reward_func/mean": -0.03221738338470459, "rewards/rollout_reward_func/std": 3.6699399948120117, "sampling/importance_sampling_ratio/max": 0.9737862348556519, "sampling/importance_sampling_ratio/mean": 0.6080856919288635, "sampling/importance_sampling_ratio/min": 0.0006178817129693925, "sampling/sampling_logp_difference/max": 3.2592616081237793, "sampling/sampling_logp_difference/mean": 0.2540919780731201, "step": 1081, "step_time": 10.910317936002684 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.009661835851147771, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.014870169339701533, "entropy": 1.937621422111988, "epoch": 0.01082, "grad_norm": 0.1983168125152588, "kl": 1.556481271982193, "learning_rate": 9.999873433734544e-06, "loss": -0.0725, "step": 1082, "step_time": 5.777244002996667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004166666883975267, "completions/clipped_ratio": 0.03125, "completions/max_length": 1520.0, "completions/max_terminated_length": 1520.0, "completions/mean_length": 482.0, "completions/mean_terminated_length": 462.4838562011719, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5874536037445068, "epoch": 0.01083, "frac_reward_zero_std": 0.0, "grad_norm": 5.148405075073242, "kl": 7.84514918550849, "learning_rate": 9.999873191619722e-06, "loss": 0.0013, "num_tokens": 25416867.0, "reward": 1.9094817638397217, "reward_std": 2.9603090286254883, "rewards/rollout_reward_func/mean": 1.9094817638397217, "rewards/rollout_reward_func/std": 3.1176443099975586, "sampling/importance_sampling_ratio/max": 0.9753654599189758, "sampling/importance_sampling_ratio/mean": 0.5155235528945923, "sampling/importance_sampling_ratio/min": 1.0255391190132847e-16, "sampling/sampling_logp_difference/max": 16.252017974853516, "sampling/sampling_logp_difference/mean": 0.3685303032398224, "step": 1083, "step_time": 11.716461264002646 }, { "clip_ratio/high_max": 0.008333333767950535, "clip_ratio/high_mean": 0.004166666883975267, "clip_ratio/low_mean": 0.004166666883975267, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008333333767950535, "entropy": 1.6686824709177017, "epoch": 0.01084, "grad_norm": 0.7304099798202515, "kl": 1.9601918049156666, "learning_rate": 9.99987294927355e-06, "loss": -0.0256, "step": 1084, "step_time": 6.971407236002051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0078125, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0078125, "completions/clipped_ratio": 0.0, "completions/max_length": 446.0, "completions/max_terminated_length": 446.0, "completions/mean_length": 21.59375, "completions/mean_terminated_length": 21.59375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5454966723918915, "epoch": 0.01085, "frac_reward_zero_std": 0.25, "grad_norm": 0.2208191603422165, "kl": 0.506733700633049, "learning_rate": 9.999872706696024e-06, "loss": -0.0275, "num_tokens": 25448256.0, "reward": 1.910880208015442, "reward_std": 1.9037115573883057, "rewards/rollout_reward_func/mean": 1.910880208015442, "rewards/rollout_reward_func/std": 3.184511661529541, "sampling/importance_sampling_ratio/max": 0.9769799113273621, "sampling/importance_sampling_ratio/mean": 0.683721125125885, "sampling/importance_sampling_ratio/min": 0.018718929961323738, "sampling/sampling_logp_difference/max": 1.7370553016662598, "sampling/sampling_logp_difference/mean": 0.1824752539396286, "step": 1085, "step_time": 6.999079437999171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "entropy": 1.5626187920570374, "epoch": 0.01086, "grad_norm": 0.22210593521595, "kl": 0.5168629828840494, "learning_rate": 9.999872463887149e-06, "loss": -0.0279, "step": 1086, "step_time": 4.20091679899997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1338.0, "completions/max_terminated_length": 1338.0, "completions/mean_length": 271.9375, "completions/mean_terminated_length": 271.9375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7853275574743748, "epoch": 0.01087, "frac_reward_zero_std": 0.5, "grad_norm": 0.34647929668426514, "kl": 1.0800634995102882, "learning_rate": 9.99987222084692e-06, "loss": -0.0335, "num_tokens": 25490149.0, "reward": 3.541121482849121, "reward_std": 1.611525058746338, "rewards/rollout_reward_func/mean": 3.541121482849121, "rewards/rollout_reward_func/std": 2.3445076942443848, "sampling/importance_sampling_ratio/max": 1.182264804840088, "sampling/importance_sampling_ratio/mean": 0.860588014125824, "sampling/importance_sampling_ratio/min": 0.001797960721887648, "sampling/sampling_logp_difference/max": 1.5830892324447632, "sampling/sampling_logp_difference/mean": 0.10914543271064758, "step": 1087, "step_time": 10.810674510001263 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "entropy": 0.8064879458397627, "epoch": 0.01088, "grad_norm": 0.3406504690647125, "kl": 1.0816296711564064, "learning_rate": 9.99987197757534e-06, "loss": -0.0335, "step": 1088, "step_time": 6.60904669699994 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "completions/clipped_ratio": 0.0, "completions/max_length": 1684.0, "completions/max_terminated_length": 1684.0, "completions/mean_length": 575.78125, "completions/mean_terminated_length": 575.78125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 2.110680304467678, "epoch": 0.01089, "frac_reward_zero_std": 0.0, "grad_norm": 0.31741219758987427, "kl": 1.781504973769188, "learning_rate": 9.99987173407241e-06, "loss": -0.0262, "num_tokens": 25544861.0, "reward": 1.8691704273223877, "reward_std": 3.5066676139831543, "rewards/rollout_reward_func/mean": 1.8691704273223877, "rewards/rollout_reward_func/std": 3.4734902381896973, "sampling/importance_sampling_ratio/max": 0.9758496284484863, "sampling/importance_sampling_ratio/mean": 0.46594130992889404, "sampling/importance_sampling_ratio/min": 0.000370785070117563, "sampling/sampling_logp_difference/max": 3.2294681072235107, "sampling/sampling_logp_difference/mean": 0.28037500381469727, "step": 1089, "step_time": 12.627237418999357 }, { "clip_ratio/high_max": 0.0078125, "clip_ratio/high_mean": 0.00390625, "clip_ratio/low_mean": 0.0052083334885537624, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.009114583488553762, "entropy": 2.1016275137662888, "epoch": 0.0109, "grad_norm": 0.24792084097862244, "kl": 1.7529390454292297, "learning_rate": 9.999871490338127e-06, "loss": -0.0267, "step": 1090, "step_time": 7.294274443000177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024038462433964014, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024038462433964014, "completions/clipped_ratio": 0.0, "completions/max_length": 1136.0, "completions/max_terminated_length": 1136.0, "completions/mean_length": 294.4375, "completions/mean_terminated_length": 294.4375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.9622202962636948, "epoch": 0.01091, "frac_reward_zero_std": 0.0, "grad_norm": 0.12721112370491028, "kl": 1.1000906713306904, "learning_rate": 9.999871246372492e-06, "loss": -0.0785, "num_tokens": 25591499.0, "reward": 0.3655930161476135, "reward_std": 2.7229840755462646, "rewards/rollout_reward_func/mean": 0.3655930161476135, "rewards/rollout_reward_func/std": 3.7974812984466553, "sampling/importance_sampling_ratio/max": 0.9700641632080078, "sampling/importance_sampling_ratio/mean": 0.5588679313659668, "sampling/importance_sampling_ratio/min": 5.451726146523147e-19, "sampling/sampling_logp_difference/max": 14.744916915893555, "sampling/sampling_logp_difference/mean": 0.46623849868774414, "step": 1091, "step_time": 10.48386881100123 }, { "clip_ratio/high_max": 0.013888888992369175, "clip_ratio/high_mean": 0.0069444444961845875, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0069444444961845875, "entropy": 1.9425694197416306, "epoch": 0.01092, "grad_norm": 0.11747217923402786, "kl": 1.0790102370083332, "learning_rate": 9.999871002175509e-06, "loss": -0.0785, "step": 1092, "step_time": 5.62179706599818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 6.0, "completions/max_terminated_length": 6.0, "completions/mean_length": 3.09375, "completions/mean_terminated_length": 3.09375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7306600138545036, "epoch": 0.01093, "frac_reward_zero_std": 0.25, "grad_norm": 0.24748241901397705, "kl": 0.7330057667568326, "learning_rate": 9.999870757747172e-06, "loss": -0.0264, "num_tokens": 25621788.0, "reward": 1.7031474113464355, "reward_std": 1.2873525619506836, "rewards/rollout_reward_func/mean": 1.7031474113464355, "rewards/rollout_reward_func/std": 3.2510290145874023, "sampling/importance_sampling_ratio/max": 0.976749837398529, "sampling/importance_sampling_ratio/mean": 0.6765742301940918, "sampling/importance_sampling_ratio/min": 9.97424599951367e-16, "sampling/sampling_logp_difference/max": 19.483169555664062, "sampling/sampling_logp_difference/mean": 0.5319679379463196, "step": 1093, "step_time": 5.460219809001501 }, { "clip_ratio/high_max": 0.0833333358168602, "clip_ratio/high_mean": 0.0416666679084301, "clip_ratio/low_mean": 0.0069444444961845875, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.04861111240461469, "entropy": 1.5733022950589657, "epoch": 0.01094, "grad_norm": 0.1657017469406128, "kl": 0.7111681774258614, "learning_rate": 9.999870513087484e-06, "loss": -0.0275, "step": 1094, "step_time": 3.1432637449997856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1738.0, "completions/max_terminated_length": 1738.0, "completions/mean_length": 406.96875, "completions/mean_terminated_length": 406.96875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.7246283292770386, "epoch": 0.01095, "frac_reward_zero_std": 0.25, "grad_norm": 0.23296304047107697, "kl": 0.942694503813982, "learning_rate": 9.999870268196444e-06, "loss": -0.0482, "num_tokens": 25669456.0, "reward": 2.3215560913085938, "reward_std": 2.4010791778564453, "rewards/rollout_reward_func/mean": 2.3215560913085938, "rewards/rollout_reward_func/std": 3.136056423187256, "sampling/importance_sampling_ratio/max": 0.9769006371498108, "sampling/importance_sampling_ratio/mean": 0.604333221912384, "sampling/importance_sampling_ratio/min": 0.0016533714951947331, "sampling/sampling_logp_difference/max": 2.3112683296203613, "sampling/sampling_logp_difference/mean": 0.20056577026844025, "step": 1095, "step_time": 12.256684062003842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.01736111124046147, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01736111124046147, "entropy": 1.6133785620331764, "epoch": 0.01096, "grad_norm": 0.2011348158121109, "kl": 0.931175519246608, "learning_rate": 9.999870023074054e-06, "loss": -0.0494, "step": 1096, "step_time": 7.041776359001233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1653.0, "completions/max_terminated_length": 1653.0, "completions/mean_length": 479.8125, "completions/mean_terminated_length": 479.8125, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 0.9917856231331825, "epoch": 0.01097, "frac_reward_zero_std": 0.25, "grad_norm": 0.4324752986431122, "kl": 0.7341181300580502, "learning_rate": 9.999869777720313e-06, "loss": -0.0412, "num_tokens": 25721481.0, "reward": 2.7302730083465576, "reward_std": 2.415513038635254, "rewards/rollout_reward_func/mean": 2.7302730083465576, "rewards/rollout_reward_func/std": 3.040954351425171, "sampling/importance_sampling_ratio/max": 0.9768392443656921, "sampling/importance_sampling_ratio/mean": 0.7019591927528381, "sampling/importance_sampling_ratio/min": 0.01188303716480732, "sampling/sampling_logp_difference/max": 1.5555974245071411, "sampling/sampling_logp_difference/mean": 0.1054496243596077, "step": 1097, "step_time": 12.998978726998757 }, { "clip_ratio/high_max": 0.028125000186264515, "clip_ratio/high_mean": 0.014062500093132257, "clip_ratio/low_mean": 0.011029412038624287, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.025091912131756544, "entropy": 0.9265770427882671, "epoch": 0.01098, "grad_norm": 0.35923337936401367, "kl": 0.7337204683572054, "learning_rate": 9.99986953213522e-06, "loss": -0.0429, "step": 1098, "step_time": 6.967077958001028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "completions/clipped_ratio": 0.0, "completions/max_length": 1334.0, "completions/max_terminated_length": 1334.0, "completions/mean_length": 458.46875, "completions/mean_terminated_length": 458.46875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.227195356041193, "epoch": 0.01099, "frac_reward_zero_std": 0.0, "grad_norm": 0.38442257046699524, "kl": 1.2641825079917908, "learning_rate": 9.999869286318774e-06, "loss": -0.0127, "num_tokens": 25774052.0, "reward": 0.7020962238311768, "reward_std": 1.5066670179367065, "rewards/rollout_reward_func/mean": 0.7020962238311768, "rewards/rollout_reward_func/std": 3.4788620471954346, "sampling/importance_sampling_ratio/max": 1.0414540767669678, "sampling/importance_sampling_ratio/mean": 0.6816283464431763, "sampling/importance_sampling_ratio/min": 0.022395310923457146, "sampling/sampling_logp_difference/max": 1.4738798141479492, "sampling/sampling_logp_difference/mean": 0.13094499707221985, "step": 1099, "step_time": 11.576223390999075 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.06314102606847882, "clip_ratio/low_min": 0.009615384973585606, "clip_ratio/region_mean": 0.06939102616161108, "entropy": 1.2214232794940472, "epoch": 0.011, "grad_norm": 0.18748922646045685, "kl": 1.254619311541319, "learning_rate": 9.999869040270978e-06, "loss": -0.0145, "step": 1100, "step_time": 6.1850072240031295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1516.0, "completions/max_terminated_length": 1516.0, "completions/mean_length": 404.28125, "completions/mean_terminated_length": 404.58062744140625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7823589332401752, "epoch": 0.01101, "frac_reward_zero_std": 0.25, "grad_norm": 0.18270239233970642, "kl": 1.0417690500617027, "learning_rate": 9.999868793991832e-06, "loss": -0.0109, "num_tokens": 25824003.0, "reward": 3.444805145263672, "reward_std": 1.4548585414886475, "rewards/rollout_reward_func/mean": 3.444805145263672, "rewards/rollout_reward_func/std": 2.5183136463165283, "sampling/importance_sampling_ratio/max": 0.9758738279342651, "sampling/importance_sampling_ratio/mean": 0.787788987159729, "sampling/importance_sampling_ratio/min": 8.648261768939847e-07, "sampling/sampling_logp_difference/max": 2.2800869941711426, "sampling/sampling_logp_difference/mean": 0.1153813898563385, "step": 1101, "step_time": 12.088386122000884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0027173913549631834, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0027173913549631834, "entropy": 0.7891221549361944, "epoch": 0.01102, "grad_norm": 0.20585034787654877, "kl": 1.042857457883656, "learning_rate": 9.999868547481334e-06, "loss": -0.0114, "step": 1102, "step_time": 6.508815962000881 }, { "clip_ratio/high_max": 0.010416666977107525, "clip_ratio/high_mean": 0.0052083334885537624, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0052083334885537624, "completions/clipped_ratio": 0.03125, "completions/max_length": 539.0, "completions/max_terminated_length": 539.0, "completions/mean_length": 140.34375, "completions/mean_terminated_length": 129.51612854003906, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.9170799255371094, "epoch": 0.01103, "frac_reward_zero_std": 0.5, "grad_norm": 0.14736545085906982, "kl": 0.7523350790143013, "learning_rate": 9.999868300739483e-06, "loss": -0.0092, "num_tokens": 25860885.0, "reward": 2.4768776893615723, "reward_std": 0.36888495087623596, "rewards/rollout_reward_func/mean": 2.4768776893615723, "rewards/rollout_reward_func/std": 2.9771792888641357, "sampling/importance_sampling_ratio/max": 1.0106279850006104, "sampling/importance_sampling_ratio/mean": 0.8136546611785889, "sampling/importance_sampling_ratio/min": 1.0688256679713959e-06, "sampling/sampling_logp_difference/max": 1.5167099237442017, "sampling/sampling_logp_difference/mean": 0.16420048475265503, "step": 1103, "step_time": 8.013850324996383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.013020833488553762, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.013020833488553762, "entropy": 0.9024640303105116, "epoch": 0.01104, "grad_norm": 0.16161546111106873, "kl": 0.7579840496182442, "learning_rate": 9.999868053766283e-06, "loss": -0.0091, "step": 1104, "step_time": 3.992261224999311 }, { "clip_ratio/high_max": 0.033333334140479565, "clip_ratio/high_mean": 0.016666667070239782, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.016666667070239782, "completions/clipped_ratio": 0.0, "completions/max_length": 1366.0, "completions/max_terminated_length": 1366.0, "completions/mean_length": 179.34375, "completions/mean_terminated_length": 179.34375, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8471609074622393, "epoch": 0.01105, "frac_reward_zero_std": 0.25, "grad_norm": 0.3367812931537628, "kl": 0.5772132771089673, "learning_rate": 9.999867806561732e-06, "loss": -0.0109, "num_tokens": 25901329.0, "reward": 0.7402280569076538, "reward_std": 1.7286090850830078, "rewards/rollout_reward_func/mean": 0.7402280569076538, "rewards/rollout_reward_func/std": 3.4840242862701416, "sampling/importance_sampling_ratio/max": 0.9860036373138428, "sampling/importance_sampling_ratio/mean": 0.8019219040870667, "sampling/importance_sampling_ratio/min": 6.97757627676765e-07, "sampling/sampling_logp_difference/max": 2.050743579864502, "sampling/sampling_logp_difference/mean": 0.1421617567539215, "step": 1105, "step_time": 10.0040568229997 }, { "clip_ratio/high_max": 0.054166668094694614, "clip_ratio/high_mean": 0.027083334047347307, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.027083334047347307, "entropy": 0.8445486817508936, "epoch": 0.01106, "grad_norm": 0.16632291674613953, "kl": 0.5729306368157268, "learning_rate": 9.999867559125829e-06, "loss": -0.0115, "step": 1106, "step_time": 6.008891757001038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 480.25, "completions/mean_terminated_length": 480.25, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4392934069037437, "epoch": 0.01107, "frac_reward_zero_std": 0.25, "grad_norm": 0.14930610358715057, "kl": 0.609428845345974, "learning_rate": 9.999867311458574e-06, "loss": -0.0301, "num_tokens": 25953868.0, "reward": 2.5081546306610107, "reward_std": 2.386207103729248, "rewards/rollout_reward_func/mean": 2.5081546306610107, "rewards/rollout_reward_func/std": 3.13838267326355, "sampling/importance_sampling_ratio/max": 0.9779512882232666, "sampling/importance_sampling_ratio/mean": 0.6116486191749573, "sampling/importance_sampling_ratio/min": 4.192590807861052e-08, "sampling/sampling_logp_difference/max": 8.854730606079102, "sampling/sampling_logp_difference/mean": 0.2662043273448944, "step": 1107, "step_time": 11.565728192004826 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.012500000186264515, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.018750000279396772, "entropy": 1.4398509860038757, "epoch": 0.01108, "grad_norm": 0.15627308189868927, "kl": 0.6388997305184603, "learning_rate": 9.999867063559969e-06, "loss": -0.0301, "step": 1108, "step_time": 6.174607239001489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 162.5, "completions/mean_terminated_length": 162.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7927671391516924, "epoch": 0.01109, "frac_reward_zero_std": 0.5, "grad_norm": 0.11205840110778809, "kl": 0.3746378542855382, "learning_rate": 9.999866815430012e-06, "loss": -0.0371, "num_tokens": 25994082.0, "reward": 2.6291091442108154, "reward_std": 1.3354582786560059, "rewards/rollout_reward_func/mean": 2.6291091442108154, "rewards/rollout_reward_func/std": 2.8743176460266113, "sampling/importance_sampling_ratio/max": 0.9769690036773682, "sampling/importance_sampling_ratio/mean": 0.863783597946167, "sampling/importance_sampling_ratio/min": 1.472297270765921e-07, "sampling/sampling_logp_difference/max": 2.011539936065674, "sampling/sampling_logp_difference/mean": 0.14731109142303467, "step": 1109, "step_time": 10.156904888999634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010416666977107525, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010416666977107525, "entropy": 0.8069647438824177, "epoch": 0.0111, "grad_norm": 0.09699476510286331, "kl": 0.38025594409555197, "learning_rate": 9.999866567068706e-06, "loss": -0.0375, "step": 1110, "step_time": 6.205936317999658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1246.0, "completions/max_terminated_length": 1246.0, "completions/mean_length": 361.40625, "completions/mean_terminated_length": 361.40625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.798273729160428, "epoch": 0.01111, "frac_reward_zero_std": 0.0, "grad_norm": 0.31353798508644104, "kl": 0.9721954949200153, "learning_rate": 9.999866318476046e-06, "loss": -0.0203, "num_tokens": 26042157.0, "reward": 3.0239670276641846, "reward_std": 2.1046626567840576, "rewards/rollout_reward_func/mean": 3.0239670276641846, "rewards/rollout_reward_func/std": 2.780261516571045, "sampling/importance_sampling_ratio/max": 0.9759023785591125, "sampling/importance_sampling_ratio/mean": 0.8409075736999512, "sampling/importance_sampling_ratio/min": 2.523636567275389e-06, "sampling/sampling_logp_difference/max": 1.865984320640564, "sampling/sampling_logp_difference/mean": 0.14303645491600037, "step": 1111, "step_time": 10.949243952996767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.010233918204903603, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.010233918204903603, "entropy": 0.7974626943469048, "epoch": 0.01112, "grad_norm": 0.09558124840259552, "kl": 0.9723746478557587, "learning_rate": 9.999866069652035e-06, "loss": -0.0211, "step": 1112, "step_time": 5.906419902001289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.002314814832061529, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.002314814832061529, "completions/clipped_ratio": 0.0, "completions/max_length": 1416.0, "completions/max_terminated_length": 1416.0, "completions/mean_length": 544.625, "completions/mean_terminated_length": 544.625, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4365136921405792, "epoch": 0.01113, "frac_reward_zero_std": 0.0, "grad_norm": 0.7270490527153015, "kl": 1.2038407027721405, "learning_rate": 9.999865820596674e-06, "loss": -0.0524, "num_tokens": 26098474.0, "reward": 2.0263867378234863, "reward_std": 2.5378215312957764, "rewards/rollout_reward_func/mean": 2.0263867378234863, "rewards/rollout_reward_func/std": 3.6011464595794678, "sampling/importance_sampling_ratio/max": 1.4670251607894897, "sampling/importance_sampling_ratio/mean": 0.6950713396072388, "sampling/importance_sampling_ratio/min": 1.5102204797247154e-20, "sampling/sampling_logp_difference/max": 15.963671684265137, "sampling/sampling_logp_difference/mean": 0.41226518154144287, "step": 1113, "step_time": 11.487535324000419 }, { "clip_ratio/high_max": 0.02350427396595478, "clip_ratio/high_mean": 0.01175213698297739, "clip_ratio/low_mean": 0.012626262847334146, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.024378399830311537, "entropy": 1.4217516519129276, "epoch": 0.01114, "grad_norm": 0.2661432921886444, "kl": 1.1976208537817001, "learning_rate": 9.999865571309964e-06, "loss": -0.0573, "step": 1114, "step_time": 6.818085699997027 }, { "clip_ratio/high_max": 0.01567398151382804, "clip_ratio/high_mean": 0.00783699075691402, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00783699075691402, "completions/clipped_ratio": 0.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 511.5, "completions/mean_terminated_length": 511.5, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.5709714591503143, "epoch": 0.01115, "frac_reward_zero_std": 0.0, "grad_norm": 0.2199317365884781, "kl": 1.1925251223146915, "learning_rate": 9.9998653217919e-06, "loss": -0.0238, "num_tokens": 26153196.0, "reward": 0.8750197887420654, "reward_std": 3.1156325340270996, "rewards/rollout_reward_func/mean": 0.8750197887420654, "rewards/rollout_reward_func/std": 3.395303249359131, "sampling/importance_sampling_ratio/max": 1.0107866525650024, "sampling/importance_sampling_ratio/mean": 0.6184989213943481, "sampling/importance_sampling_ratio/min": 5.35757133102166e-11, "sampling/sampling_logp_difference/max": 2.7937350273132324, "sampling/sampling_logp_difference/mean": 0.282469242811203, "step": 1115, "step_time": 10.750396377998186 }, { "clip_ratio/high_max": 0.01567398151382804, "clip_ratio/high_mean": 0.00783699075691402, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.00783699075691402, "entropy": 1.5616351217031479, "epoch": 0.01116, "grad_norm": 0.23544175922870636, "kl": 1.1957936435937881, "learning_rate": 9.999865072042485e-06, "loss": -0.024, "step": 1116, "step_time": 6.2794209310013684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0024999999441206455, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0024999999441206455, "completions/clipped_ratio": 0.03125, "completions/max_length": 1203.0, "completions/max_terminated_length": 1203.0, "completions/mean_length": 429.40625, "completions/mean_terminated_length": 426.1290283203125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 1.4667610973119736, "epoch": 0.01117, "frac_reward_zero_std": 0.0, "grad_norm": 0.2858284115791321, "kl": 1.2395273372530937, "learning_rate": 9.999864822061722e-06, "loss": -0.0357, "num_tokens": 26204357.0, "reward": 0.47955822944641113, "reward_std": 2.434410333633423, "rewards/rollout_reward_func/mean": 0.47955822944641113, "rewards/rollout_reward_func/std": 3.6084368228912354, "sampling/importance_sampling_ratio/max": 1.0608336925506592, "sampling/importance_sampling_ratio/mean": 0.6139628291130066, "sampling/importance_sampling_ratio/min": 8.200460540383047e-16, "sampling/sampling_logp_difference/max": 15.459783554077148, "sampling/sampling_logp_difference/mean": 0.38201943039894104, "step": 1117, "step_time": 10.565296931998091 }, { "clip_ratio/high_max": 0.01923076994717121, "clip_ratio/high_mean": 0.009615384973585606, "clip_ratio/low_mean": 0.008181818295270205, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.01779720326885581, "entropy": 1.4615815728902817, "epoch": 0.01118, "grad_norm": 0.20268800854682922, "kl": 1.2184313759207726, "learning_rate": 9.999864571849604e-06, "loss": -0.0362, "step": 1118, "step_time": 6.284622446999492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.008342391345649958, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.008342391345649958, "completions/clipped_ratio": 0.0625, "completions/max_length": 1442.0, "completions/max_terminated_length": 1442.0, "completions/mean_length": 513.75, "completions/mean_terminated_length": 512.1333618164062, "completions/min_length": 2.0, "completions/min_terminated_length": 2.0, "entropy": 1.389377161860466, "epoch": 0.01119, "frac_reward_zero_std": 0.0, "grad_norm": 0.10700110346078873, "kl": 0.8695030510425568, "learning_rate": 9.999864321406137e-06, "loss": -0.0262, "num_tokens": 26259342.0, "reward": 2.1270503997802734, "reward_std": 2.29775333404541, "rewards/rollout_reward_func/mean": 2.1270503997802734, "rewards/rollout_reward_func/std": 3.2985854148864746, "sampling/importance_sampling_ratio/max": 0.9894829392433167, "sampling/importance_sampling_ratio/mean": 0.6604461669921875, "sampling/importance_sampling_ratio/min": 2.3490139938076027e-05, "sampling/sampling_logp_difference/max": 2.499014139175415, "sampling/sampling_logp_difference/mean": 0.2369794398546219, "step": 1119, "step_time": 11.518792304001181 }, { "clip_ratio/high_max": 0.012500000186264515, "clip_ratio/high_mean": 0.0062500000931322575, "clip_ratio/low_mean": 0.005434782709926367, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.011684782803058624, "entropy": 1.384571798145771, "epoch": 0.0112, "grad_norm": 0.10745232552289963, "kl": 0.8577835708856583, "learning_rate": 9.99986407073132e-06, "loss": -0.0264, "step": 1120, "step_time": 6.865446264002458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 895.0, "completions/max_terminated_length": 895.0, "completions/mean_length": 170.53125, "completions/mean_terminated_length": 170.53125, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.7165241874754429, "epoch": 0.01121, "frac_reward_zero_std": 0.25, "grad_norm": 0.12697115540504456, "kl": 0.8944777213037014, "learning_rate": 9.99986381982515e-06, "loss": -0.0387, "num_tokens": 26300378.0, "reward": 0.682511568069458, "reward_std": 0.1860191822052002, "rewards/rollout_reward_func/mean": 0.682511568069458, "rewards/rollout_reward_func/std": 3.1297895908355713, "sampling/importance_sampling_ratio/max": 0.9776239991188049, "sampling/importance_sampling_ratio/mean": 0.8395912647247314, "sampling/importance_sampling_ratio/min": 0.04549873620271683, "sampling/sampling_logp_difference/max": 2.096187114715576, "sampling/sampling_logp_difference/mean": 0.08460790663957596, "step": 1121, "step_time": 9.00169418799851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.005681818351149559, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.005681818351149559, "entropy": 0.7135546952486038, "epoch": 0.01122, "grad_norm": 0.06731804460287094, "kl": 0.8893876448273659, "learning_rate": 9.99986356868763e-06, "loss": -0.0388, "step": 1122, "step_time": 5.098914369998965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.03125, "completions/max_length": 1360.0, "completions/max_terminated_length": 1360.0, "completions/mean_length": 361.0625, "completions/mean_terminated_length": 333.8709716796875, "completions/min_length": 3.0, "completions/min_terminated_length": 3.0, "entropy": 0.8289035130292177, "epoch": 0.01123, "frac_reward_zero_std": 0.25, "grad_norm": 0.11629357188940048, "kl": 0.7325576543807983, "learning_rate": 9.999863317318758e-06, "loss": -0.0254, "num_tokens": 26347871.0, "reward": 0.9698797464370728, "reward_std": 1.9315208196640015, "rewards/rollout_reward_func/mean": 0.9698797464370728, "rewards/rollout_reward_func/std": 3.368046522140503, "sampling/importance_sampling_ratio/max": 1.0178104639053345, "sampling/importance_sampling_ratio/mean": 0.7986239194869995, "sampling/importance_sampling_ratio/min": 0.0009169862605631351, "sampling/sampling_logp_difference/max": 2.7377750873565674, "sampling/sampling_logp_difference/mean": 0.11485858261585236, "step": 1123, "step_time": 11.556461060998117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.004807692486792803, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.004807692486792803, "entropy": 0.8270824160426855, "epoch": 0.01124, "grad_norm": 0.11238475143909454, "kl": 0.7369669899344444, "learning_rate": 9.999863065718538e-06, "loss": -0.0255, "step": 1124, "step_time": 6.569485143998463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.0, "completions/max_length": 1164.0, "completions/max_terminated_length": 1164.0, "completions/mean_length": 587.46875, "completions/mean_terminated_length": 587.46875, "completions/min_length": 308.0, "completions/min_terminated_length": 308.0, "entropy": 1.349198505282402, "epoch": 0.01125, "frac_reward_zero_std": 0.0, "grad_norm": 0.21005646884441376, "kl": 1.2791283056139946, "learning_rate": 9.999862813886966e-06, "loss": -0.0418, "num_tokens": 26405672.0, "reward": 1.5818477869033813, "reward_std": 2.595496654510498, "rewards/rollout_reward_func/mean": 1.5818477869033813, "rewards/rollout_reward_func/std": 3.6299643516540527, "sampling/importance_sampling_ratio/max": 1.1311883926391602, "sampling/importance_sampling_ratio/mean": 0.5967539548873901, "sampling/importance_sampling_ratio/min": 0.04081001505255699, "sampling/sampling_logp_difference/max": 2.1310977935791016, "sampling/sampling_logp_difference/mean": 0.1633959412574768, "step": 1125, "step_time": 10.641362597003535 } ], "logging_steps": 1.0, "max_steps": 400000, "num_input_tokens_seen": 26405672, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 2, "trial_name": null, "trial_params": null }