| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.032520325203252, |
| "eval_steps": 500, |
| "global_step": 250, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14397.0, |
| "completions/max_terminated_length": 14397.0, |
| "completions/mean_length": 10168.53125, |
| "completions/mean_terminated_length": 10168.53125, |
| "completions/min_length": 5260.0, |
| "completions/min_terminated_length": 5260.0, |
| "entropy": 0.2622494325041771, |
| "epoch": 0.008130081300813009, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3732398450374603, |
| "learning_rate": 1e-06, |
| "loss": -0.0544, |
| "num_tokens": 346273.0, |
| "reward": 0.670829176902771, |
| "reward_std": 0.5900986194610596, |
| "rewards/reward_func/mean": 0.670829176902771, |
| "rewards/reward_func/std": 0.5900986194610596, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9909619092941284, |
| "sampling/importance_sampling_ratio/min": 0.061016567051410675, |
| "sampling/sampling_logp_difference/max": 2.796609878540039, |
| "sampling/sampling_logp_difference/mean": 0.018657810986042023, |
| "step": 1, |
| "step_time": 415.91905756667256 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13889.0, |
| "completions/max_terminated_length": 13889.0, |
| "completions/mean_length": 8686.625, |
| "completions/mean_terminated_length": 8686.625, |
| "completions/min_length": 3608.0, |
| "completions/min_terminated_length": 3608.0, |
| "entropy": 0.30042469687759876, |
| "epoch": 0.016260162601626018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42703714966773987, |
| "learning_rate": 9.959999999999999e-07, |
| "loss": -0.017, |
| "num_tokens": 637157.0, |
| "reward": 0.3524366021156311, |
| "reward_std": 0.5026865601539612, |
| "rewards/reward_func/mean": 0.3524366021156311, |
| "rewards/reward_func/std": 0.5026865601539612, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.337921380996704, |
| "sampling/importance_sampling_ratio/mean": 0.9895248413085938, |
| "sampling/importance_sampling_ratio/min": 0.2977469265460968, |
| "sampling/sampling_logp_difference/max": 1.2115113735198975, |
| "sampling/sampling_logp_difference/mean": 0.021018292754888535, |
| "step": 2, |
| "step_time": 367.433393279789 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16789.0, |
| "completions/max_terminated_length": 16789.0, |
| "completions/mean_length": 8983.875, |
| "completions/mean_terminated_length": 8983.875, |
| "completions/min_length": 1247.0, |
| "completions/min_terminated_length": 1247.0, |
| "entropy": 0.28624483197927475, |
| "epoch": 0.024390243902439025, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42274078726768494, |
| "learning_rate": 9.92e-07, |
| "loss": -0.01, |
| "num_tokens": 942513.0, |
| "reward": 1.4242839813232422, |
| "reward_std": 2.6354684829711914, |
| "rewards/reward_func/mean": 1.4273738861083984, |
| "rewards/reward_func/std": 2.6337430477142334, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00308990478515625, |
| "rewards/soft_overlong_punishment_reward/std": 0.017479142174124718, |
| "sampling/importance_sampling_ratio/max": 1.82925283908844, |
| "sampling/importance_sampling_ratio/mean": 0.9905841946601868, |
| "sampling/importance_sampling_ratio/min": 0.010209716856479645, |
| "sampling/sampling_logp_difference/max": 4.584415435791016, |
| "sampling/sampling_logp_difference/mean": 0.019620321691036224, |
| "step": 3, |
| "step_time": 357.44491101126187 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15431.0, |
| "completions/max_terminated_length": 15431.0, |
| "completions/mean_length": 10926.4375, |
| "completions/mean_terminated_length": 10926.4375, |
| "completions/min_length": 6775.0, |
| "completions/min_terminated_length": 6775.0, |
| "entropy": 0.26211014203727245, |
| "epoch": 0.032520325203252036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43384817242622375, |
| "learning_rate": 9.88e-07, |
| "loss": -0.0053, |
| "num_tokens": 1312175.0, |
| "reward": 0.4805724620819092, |
| "reward_std": 0.5330014824867249, |
| "rewards/reward_func/mean": 0.4805724620819092, |
| "rewards/reward_func/std": 0.5330014824867249, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.227057933807373, |
| "sampling/importance_sampling_ratio/mean": 0.990934431552887, |
| "sampling/importance_sampling_ratio/min": 0.023468947038054466, |
| "sampling/sampling_logp_difference/max": 3.752077102661133, |
| "sampling/sampling_logp_difference/mean": 0.018750693649053574, |
| "step": 4, |
| "step_time": 371.1139560753945 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16857.0, |
| "completions/max_terminated_length": 16857.0, |
| "completions/mean_length": 8058.5625, |
| "completions/mean_terminated_length": 8058.5625, |
| "completions/min_length": 3038.0, |
| "completions/min_terminated_length": 3038.0, |
| "entropy": 0.2765342816710472, |
| "epoch": 0.04065040650406504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43143269419670105, |
| "learning_rate": 9.84e-07, |
| "loss": 0.0181, |
| "num_tokens": 1585833.0, |
| "reward": 1.0776739120483398, |
| "reward_std": 0.8259380459785461, |
| "rewards/reward_func/mean": 1.081282615661621, |
| "rewards/reward_func/std": 0.8210198283195496, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00360870361328125, |
| "rewards/soft_overlong_punishment_reward/std": 0.02041390910744667, |
| "sampling/importance_sampling_ratio/max": 2.1974010467529297, |
| "sampling/importance_sampling_ratio/mean": 0.9910835027694702, |
| "sampling/importance_sampling_ratio/min": 0.31773900985717773, |
| "sampling/sampling_logp_difference/max": 1.1465249061584473, |
| "sampling/sampling_logp_difference/mean": 0.018462253734469414, |
| "step": 5, |
| "step_time": 336.172940433491 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15287.0, |
| "completions/max_terminated_length": 15287.0, |
| "completions/mean_length": 9501.75, |
| "completions/mean_terminated_length": 9501.75, |
| "completions/min_length": 4768.0, |
| "completions/min_terminated_length": 4768.0, |
| "entropy": 0.2606150833889842, |
| "epoch": 0.04878048780487805, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.367913156747818, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0207, |
| "num_tokens": 1917425.0, |
| "reward": 0.7828472852706909, |
| "reward_std": 0.7593443989753723, |
| "rewards/reward_func/mean": 0.7828472852706909, |
| "rewards/reward_func/std": 0.7593443989753723, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.830876588821411, |
| "sampling/importance_sampling_ratio/mean": 0.9910170435905457, |
| "sampling/importance_sampling_ratio/min": 0.05133574455976486, |
| "sampling/sampling_logp_difference/max": 2.9693679809570312, |
| "sampling/sampling_logp_difference/mean": 0.01832835003733635, |
| "step": 6, |
| "step_time": 453.13167459354736 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16987.0, |
| "completions/max_terminated_length": 16987.0, |
| "completions/mean_length": 11730.90625, |
| "completions/mean_terminated_length": 11730.90625, |
| "completions/min_length": 6346.0, |
| "completions/min_terminated_length": 6346.0, |
| "entropy": 0.2580571649596095, |
| "epoch": 0.056910569105691054, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3932175934314728, |
| "learning_rate": 9.759999999999998e-07, |
| "loss": 0.0595, |
| "num_tokens": 2308110.0, |
| "reward": 0.4529891014099121, |
| "reward_std": 0.5422401428222656, |
| "rewards/reward_func/mean": 0.4602828025817871, |
| "rewards/reward_func/std": 0.5356888175010681, |
| "rewards/soft_overlong_punishment_reward/mean": -0.007293701171875, |
| "rewards/soft_overlong_punishment_reward/std": 0.029728731140494347, |
| "sampling/importance_sampling_ratio/max": 2.425518274307251, |
| "sampling/importance_sampling_ratio/mean": 0.9910261034965515, |
| "sampling/importance_sampling_ratio/min": 0.00816231407225132, |
| "sampling/sampling_logp_difference/max": 4.8082275390625, |
| "sampling/sampling_logp_difference/mean": 0.018817249685525894, |
| "step": 7, |
| "step_time": 420.06239356310107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13961.0, |
| "completions/max_terminated_length": 13961.0, |
| "completions/mean_length": 10841.65625, |
| "completions/mean_terminated_length": 10841.65625, |
| "completions/min_length": 5514.0, |
| "completions/min_terminated_length": 5514.0, |
| "entropy": 0.2597746094688773, |
| "epoch": 0.06504065040650407, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.3083711266517639, |
| "learning_rate": 9.72e-07, |
| "loss": 0.062, |
| "num_tokens": 2676595.0, |
| "reward": 0.3495897352695465, |
| "reward_std": 0.7178114652633667, |
| "rewards/reward_func/mean": 0.3495897352695465, |
| "rewards/reward_func/std": 0.7178115248680115, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9910988807678223, |
| "sampling/importance_sampling_ratio/min": 0.1719096601009369, |
| "sampling/sampling_logp_difference/max": 1.7607861757278442, |
| "sampling/sampling_logp_difference/mean": 0.01827491819858551, |
| "step": 8, |
| "step_time": 413.52012689388357 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16061.0, |
| "completions/max_terminated_length": 16061.0, |
| "completions/mean_length": 11138.6875, |
| "completions/mean_terminated_length": 11138.6875, |
| "completions/min_length": 5675.0, |
| "completions/min_terminated_length": 5675.0, |
| "entropy": 0.2899176850914955, |
| "epoch": 0.07317073170731707, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36722126603126526, |
| "learning_rate": 9.679999999999999e-07, |
| "loss": 0.0229, |
| "num_tokens": 3048729.0, |
| "reward": 0.39484167098999023, |
| "reward_std": 0.5577594637870789, |
| "rewards/reward_func/mean": 0.39484167098999023, |
| "rewards/reward_func/std": 0.5577594637870789, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.7904616594314575, |
| "sampling/importance_sampling_ratio/mean": 0.989971399307251, |
| "sampling/importance_sampling_ratio/min": 0.26559245586395264, |
| "sampling/sampling_logp_difference/max": 1.3257923126220703, |
| "sampling/sampling_logp_difference/mean": 0.020297091454267502, |
| "step": 9, |
| "step_time": 361.87901354860514 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15494.0, |
| "completions/max_terminated_length": 15494.0, |
| "completions/mean_length": 11128.1875, |
| "completions/mean_terminated_length": 11128.1875, |
| "completions/min_length": 8007.0, |
| "completions/min_terminated_length": 8007.0, |
| "entropy": 0.289524232968688, |
| "epoch": 0.08130081300813008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3917076289653778, |
| "learning_rate": 9.64e-07, |
| "loss": -0.0053, |
| "num_tokens": 3421799.0, |
| "reward": 1.557268738746643, |
| "reward_std": 4.395501613616943, |
| "rewards/reward_func/mean": 1.557268738746643, |
| "rewards/reward_func/std": 4.395502090454102, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9057838916778564, |
| "sampling/importance_sampling_ratio/mean": 0.9899657964706421, |
| "sampling/importance_sampling_ratio/min": 0.00011426152923377231, |
| "sampling/sampling_logp_difference/max": 9.077020645141602, |
| "sampling/sampling_logp_difference/mean": 0.02046096697449684, |
| "step": 10, |
| "step_time": 530.6975831072778 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16418.0, |
| "completions/max_terminated_length": 16418.0, |
| "completions/mean_length": 11667.96875, |
| "completions/mean_terminated_length": 11667.96875, |
| "completions/min_length": 8577.0, |
| "completions/min_terminated_length": 8577.0, |
| "entropy": 0.25299404561519623, |
| "epoch": 0.08943089430894309, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 7.989307880401611, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0268, |
| "num_tokens": 3815326.0, |
| "reward": 0.4127262830734253, |
| "reward_std": 0.5275108814239502, |
| "rewards/reward_func/mean": 0.4129856824874878, |
| "rewards/reward_func/std": 0.5273244976997375, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0002593994140625, |
| "rewards/soft_overlong_punishment_reward/std": 0.0014673846308141947, |
| "sampling/importance_sampling_ratio/max": 2.6564552783966064, |
| "sampling/importance_sampling_ratio/mean": 0.9911899566650391, |
| "sampling/importance_sampling_ratio/min": 0.042087722569704056, |
| "sampling/sampling_logp_difference/max": 3.167999267578125, |
| "sampling/sampling_logp_difference/mean": 0.01853198930621147, |
| "step": 11, |
| "step_time": 455.64031926658936 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15432.0, |
| "completions/max_terminated_length": 15432.0, |
| "completions/mean_length": 10867.875, |
| "completions/mean_terminated_length": 10867.875, |
| "completions/min_length": 7317.0, |
| "completions/min_terminated_length": 7317.0, |
| "entropy": 0.26554491464048624, |
| "epoch": 0.0975609756097561, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35505810379981995, |
| "learning_rate": 9.559999999999998e-07, |
| "loss": 0.0226, |
| "num_tokens": 4184850.0, |
| "reward": 0.47400495409965515, |
| "reward_std": 0.48420941829681396, |
| "rewards/reward_func/mean": 0.47400495409965515, |
| "rewards/reward_func/std": 0.4842093884944916, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1569271087646484, |
| "sampling/importance_sampling_ratio/mean": 0.9906758069992065, |
| "sampling/importance_sampling_ratio/min": 0.01572277396917343, |
| "sampling/sampling_logp_difference/max": 4.152645111083984, |
| "sampling/sampling_logp_difference/mean": 0.019163597375154495, |
| "step": 12, |
| "step_time": 359.7980523931328 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 19330.0, |
| "completions/max_terminated_length": 19330.0, |
| "completions/mean_length": 11394.46875, |
| "completions/mean_terminated_length": 11394.46875, |
| "completions/min_length": 6975.0, |
| "completions/min_terminated_length": 6975.0, |
| "entropy": 0.26600847486406565, |
| "epoch": 0.10569105691056911, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3638916611671448, |
| "learning_rate": 9.52e-07, |
| "loss": 0.0105, |
| "num_tokens": 4567681.0, |
| "reward": 0.6445801258087158, |
| "reward_std": 0.9351080656051636, |
| "rewards/reward_func/mean": 0.6834137439727783, |
| "rewards/reward_func/std": 0.8933014273643494, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0388336181640625, |
| "rewards/soft_overlong_punishment_reward/std": 0.15481862425804138, |
| "sampling/importance_sampling_ratio/max": 2.923246383666992, |
| "sampling/importance_sampling_ratio/mean": 0.9907053112983704, |
| "sampling/importance_sampling_ratio/min": 0.09609930962324142, |
| "sampling/sampling_logp_difference/max": 2.3423731327056885, |
| "sampling/sampling_logp_difference/mean": 0.0187990739941597, |
| "step": 13, |
| "step_time": 594.1962707811035 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12979.0, |
| "completions/max_terminated_length": 12979.0, |
| "completions/mean_length": 7971.6875, |
| "completions/mean_terminated_length": 7971.6875, |
| "completions/min_length": 3552.0, |
| "completions/min_terminated_length": 3552.0, |
| "entropy": 0.3081512898206711, |
| "epoch": 0.11382113821138211, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46477997303009033, |
| "learning_rate": 9.479999999999999e-07, |
| "loss": 0.0457, |
| "num_tokens": 4830719.0, |
| "reward": 0.9467858076095581, |
| "reward_std": 0.8943569660186768, |
| "rewards/reward_func/mean": 0.9467858076095581, |
| "rewards/reward_func/std": 0.8943569660186768, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9614217281341553, |
| "sampling/importance_sampling_ratio/mean": 0.989205002784729, |
| "sampling/importance_sampling_ratio/min": 0.0001446620444767177, |
| "sampling/sampling_logp_difference/max": 8.841110229492188, |
| "sampling/sampling_logp_difference/mean": 0.021643634885549545, |
| "step": 14, |
| "step_time": 289.7629952353891 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16237.0, |
| "completions/max_terminated_length": 16237.0, |
| "completions/mean_length": 10616.90625, |
| "completions/mean_terminated_length": 10616.90625, |
| "completions/min_length": 7285.0, |
| "completions/min_terminated_length": 7285.0, |
| "entropy": 0.293476989492774, |
| "epoch": 0.12195121951219512, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.3079313635826111, |
| "learning_rate": 9.439999999999999e-07, |
| "loss": -0.0016, |
| "num_tokens": 5184964.0, |
| "reward": 0.4604455828666687, |
| "reward_std": 0.6293958425521851, |
| "rewards/reward_func/mean": 0.4604455828666687, |
| "rewards/reward_func/std": 0.6293958425521851, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9898009300231934, |
| "sampling/importance_sampling_ratio/min": 0.18817251920700073, |
| "sampling/sampling_logp_difference/max": 1.670396089553833, |
| "sampling/sampling_logp_difference/mean": 0.020975295454263687, |
| "step": 15, |
| "step_time": 446.5528327494394 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13793.0, |
| "completions/max_terminated_length": 13793.0, |
| "completions/mean_length": 10209.96875, |
| "completions/mean_terminated_length": 10209.96875, |
| "completions/min_length": 7867.0, |
| "completions/min_terminated_length": 7867.0, |
| "entropy": 0.26629081927239895, |
| "epoch": 0.13008130081300814, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.375059574842453, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": -0.0079, |
| "num_tokens": 5530355.0, |
| "reward": 0.5800999999046326, |
| "reward_std": 0.6380655169487, |
| "rewards/reward_func/mean": 0.5800999999046326, |
| "rewards/reward_func/std": 0.6380655765533447, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9908578991889954, |
| "sampling/importance_sampling_ratio/min": 0.24528150260448456, |
| "sampling/sampling_logp_difference/max": 1.405348777770996, |
| "sampling/sampling_logp_difference/mean": 0.019185448065400124, |
| "step": 16, |
| "step_time": 322.7766472310759 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14345.0, |
| "completions/max_terminated_length": 14345.0, |
| "completions/mean_length": 10214.875, |
| "completions/mean_terminated_length": 10214.875, |
| "completions/min_length": 8254.0, |
| "completions/min_terminated_length": 8254.0, |
| "entropy": 0.28936258889734745, |
| "epoch": 0.13821138211382114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39189663529396057, |
| "learning_rate": 9.36e-07, |
| "loss": -0.01, |
| "num_tokens": 5868399.0, |
| "reward": 0.7184683680534363, |
| "reward_std": 1.3543201684951782, |
| "rewards/reward_func/mean": 0.7184683680534363, |
| "rewards/reward_func/std": 1.3543201684951782, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9899466037750244, |
| "sampling/importance_sampling_ratio/min": 0.14265097677707672, |
| "sampling/sampling_logp_difference/max": 1.9473543167114258, |
| "sampling/sampling_logp_difference/mean": 0.02051045559346676, |
| "step": 17, |
| "step_time": 332.0521780475974 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15805.0, |
| "completions/max_terminated_length": 15805.0, |
| "completions/mean_length": 12219.53125, |
| "completions/mean_terminated_length": 12219.53125, |
| "completions/min_length": 6227.0, |
| "completions/min_terminated_length": 6227.0, |
| "entropy": 0.24262672010809183, |
| "epoch": 0.14634146341463414, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36067014932632446, |
| "learning_rate": 9.32e-07, |
| "loss": -0.0058, |
| "num_tokens": 6284928.0, |
| "reward": 0.4509197473526001, |
| "reward_std": 0.5910488963127136, |
| "rewards/reward_func/mean": 0.4509197473526001, |
| "rewards/reward_func/std": 0.5910489559173584, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.732221841812134, |
| "sampling/importance_sampling_ratio/mean": 0.9915755987167358, |
| "sampling/importance_sampling_ratio/min": 0.08208515495061874, |
| "sampling/sampling_logp_difference/max": 2.499998092651367, |
| "sampling/sampling_logp_difference/mean": 0.017901983112096786, |
| "step": 18, |
| "step_time": 481.0771376036573 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14306.0, |
| "completions/max_terminated_length": 14306.0, |
| "completions/mean_length": 11337.0625, |
| "completions/mean_terminated_length": 11337.0625, |
| "completions/min_length": 7568.0, |
| "completions/min_terminated_length": 7568.0, |
| "entropy": 0.2646152526140213, |
| "epoch": 0.15447154471544716, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36779990792274475, |
| "learning_rate": 9.28e-07, |
| "loss": -0.0053, |
| "num_tokens": 6675994.0, |
| "reward": 0.4712689220905304, |
| "reward_std": 0.6008197665214539, |
| "rewards/reward_func/mean": 0.4712689220905304, |
| "rewards/reward_func/std": 0.6008197069168091, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.159226179122925, |
| "sampling/importance_sampling_ratio/mean": 0.990673303604126, |
| "sampling/importance_sampling_ratio/min": 0.13695114850997925, |
| "sampling/sampling_logp_difference/max": 1.988131046295166, |
| "sampling/sampling_logp_difference/mean": 0.019125521183013916, |
| "step": 19, |
| "step_time": 353.8724719102029 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14549.0, |
| "completions/max_terminated_length": 14549.0, |
| "completions/mean_length": 10831.9375, |
| "completions/mean_terminated_length": 10831.9375, |
| "completions/min_length": 7563.0, |
| "completions/min_terminated_length": 7563.0, |
| "entropy": 0.2252928400412202, |
| "epoch": 0.16260162601626016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3673231899738312, |
| "learning_rate": 9.24e-07, |
| "loss": 0.0035, |
| "num_tokens": 7065000.0, |
| "reward": 0.3244926333427429, |
| "reward_std": 0.3530224561691284, |
| "rewards/reward_func/mean": 0.3244926333427429, |
| "rewards/reward_func/std": 0.3530224561691284, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0267248153686523, |
| "sampling/importance_sampling_ratio/mean": 0.9923210740089417, |
| "sampling/importance_sampling_ratio/min": 0.01011732593178749, |
| "sampling/sampling_logp_difference/max": 4.593505859375, |
| "sampling/sampling_logp_difference/mean": 0.016302792355418205, |
| "step": 20, |
| "step_time": 574.1707554678433 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16370.0, |
| "completions/max_terminated_length": 16370.0, |
| "completions/mean_length": 10895.40625, |
| "completions/mean_terminated_length": 10895.40625, |
| "completions/min_length": 6243.0, |
| "completions/min_terminated_length": 6243.0, |
| "entropy": 0.28782651759684086, |
| "epoch": 0.17073170731707318, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3720740079879761, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0136, |
| "num_tokens": 7431013.0, |
| "reward": 0.9185539484024048, |
| "reward_std": 0.8305948376655579, |
| "rewards/reward_func/mean": 0.9185539484024048, |
| "rewards/reward_func/std": 0.8305947780609131, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.346141815185547, |
| "sampling/importance_sampling_ratio/mean": 0.9899004697799683, |
| "sampling/importance_sampling_ratio/min": 0.1588689088821411, |
| "sampling/sampling_logp_difference/max": 1.8396759033203125, |
| "sampling/sampling_logp_difference/mean": 0.020509924739599228, |
| "step": 21, |
| "step_time": 422.31716467044316 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13981.0, |
| "completions/max_terminated_length": 13981.0, |
| "completions/mean_length": 10807.5625, |
| "completions/mean_terminated_length": 10807.5625, |
| "completions/min_length": 8205.0, |
| "completions/min_terminated_length": 8205.0, |
| "entropy": 0.24795905128121376, |
| "epoch": 0.17886178861788618, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3784899115562439, |
| "learning_rate": 9.16e-07, |
| "loss": 0.002, |
| "num_tokens": 7802807.0, |
| "reward": 0.5439961552619934, |
| "reward_std": 0.6093953251838684, |
| "rewards/reward_func/mean": 0.5439961552619934, |
| "rewards/reward_func/std": 0.6093953251838684, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9913806915283203, |
| "sampling/importance_sampling_ratio/min": 0.07173754274845123, |
| "sampling/sampling_logp_difference/max": 2.6347410678863525, |
| "sampling/sampling_logp_difference/mean": 0.01798321306705475, |
| "step": 22, |
| "step_time": 554.0859813888092 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15253.0, |
| "completions/max_terminated_length": 15253.0, |
| "completions/mean_length": 10385.09375, |
| "completions/mean_terminated_length": 10385.09375, |
| "completions/min_length": 6117.0, |
| "completions/min_terminated_length": 6117.0, |
| "entropy": 0.2895797435194254, |
| "epoch": 0.18699186991869918, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3935360014438629, |
| "learning_rate": 9.12e-07, |
| "loss": 0.0286, |
| "num_tokens": 8154338.0, |
| "reward": 0.690646767616272, |
| "reward_std": 0.689028263092041, |
| "rewards/reward_func/mean": 0.690646767616272, |
| "rewards/reward_func/std": 0.6890282034873962, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2023396492004395, |
| "sampling/importance_sampling_ratio/mean": 0.9899480938911438, |
| "sampling/importance_sampling_ratio/min": 0.07151451706886292, |
| "sampling/sampling_logp_difference/max": 2.637854814529419, |
| "sampling/sampling_logp_difference/mean": 0.020658794790506363, |
| "step": 23, |
| "step_time": 574.1298945220187 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15523.0, |
| "completions/max_terminated_length": 15523.0, |
| "completions/mean_length": 9569.84375, |
| "completions/mean_terminated_length": 9569.84375, |
| "completions/min_length": 3304.0, |
| "completions/min_terminated_length": 3304.0, |
| "entropy": 0.30353074334561825, |
| "epoch": 0.1951219512195122, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41558295488357544, |
| "learning_rate": 9.08e-07, |
| "loss": -0.0139, |
| "num_tokens": 8474285.0, |
| "reward": 0.6444994807243347, |
| "reward_std": 0.611183762550354, |
| "rewards/reward_func/mean": 0.6444994807243347, |
| "rewards/reward_func/std": 0.611183762550354, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9894601702690125, |
| "sampling/importance_sampling_ratio/min": 0.27406805753707886, |
| "sampling/sampling_logp_difference/max": 1.6113767623901367, |
| "sampling/sampling_logp_difference/mean": 0.021533746272325516, |
| "step": 24, |
| "step_time": 392.5540506092366 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17367.0, |
| "completions/max_terminated_length": 17367.0, |
| "completions/mean_length": 10313.59375, |
| "completions/mean_terminated_length": 10313.59375, |
| "completions/min_length": 5453.0, |
| "completions/min_terminated_length": 5453.0, |
| "entropy": 0.28294676542282104, |
| "epoch": 0.2032520325203252, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37816983461380005, |
| "learning_rate": 9.039999999999999e-07, |
| "loss": -0.0331, |
| "num_tokens": 8821040.0, |
| "reward": 0.7799074649810791, |
| "reward_std": 0.7231826782226562, |
| "rewards/reward_func/mean": 0.7874071598052979, |
| "rewards/reward_func/std": 0.7279127836227417, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00749969482421875, |
| "rewards/soft_overlong_punishment_reward/std": 0.042424678802490234, |
| "sampling/importance_sampling_ratio/max": 2.6919732093811035, |
| "sampling/importance_sampling_ratio/mean": 0.99010169506073, |
| "sampling/importance_sampling_ratio/min": 0.24937866628170013, |
| "sampling/sampling_logp_difference/max": 1.3887828588485718, |
| "sampling/sampling_logp_difference/mean": 0.020352153107523918, |
| "step": 25, |
| "step_time": 420.245932768099 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 20480.0, |
| "completions/max_terminated_length": 13815.0, |
| "completions/mean_length": 10411.9375, |
| "completions/mean_terminated_length": 10087.1611328125, |
| "completions/min_length": 7108.0, |
| "completions/min_terminated_length": 7108.0, |
| "entropy": 0.25935694947838783, |
| "epoch": 0.21138211382113822, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34230929613113403, |
| "learning_rate": 9e-07, |
| "loss": -0.0888, |
| "num_tokens": 9173350.0, |
| "reward": 0.48251837491989136, |
| "reward_std": 0.5877281427383423, |
| "rewards/reward_func/mean": 0.5303415656089783, |
| "rewards/reward_func/std": 0.5303896069526672, |
| "rewards/soft_overlong_punishment_reward/mean": -0.03125, |
| "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, |
| "sampling/importance_sampling_ratio/max": 2.9357402324676514, |
| "sampling/importance_sampling_ratio/mean": 0.9906986951828003, |
| "sampling/importance_sampling_ratio/min": 0.20663174986839294, |
| "sampling/sampling_logp_difference/max": 1.5768170356750488, |
| "sampling/sampling_logp_difference/mean": 0.01940227299928665, |
| "step": 26, |
| "step_time": 618.8288546381518 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 20480.0, |
| "completions/max_terminated_length": 19929.0, |
| "completions/mean_length": 12748.8125, |
| "completions/mean_terminated_length": 12499.4189453125, |
| "completions/min_length": 8056.0, |
| "completions/min_terminated_length": 8056.0, |
| "entropy": 0.2515546875074506, |
| "epoch": 0.21951219512195122, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3102463185787201, |
| "learning_rate": 8.96e-07, |
| "loss": 0.0175, |
| "num_tokens": 9609168.0, |
| "reward": 0.390100359916687, |
| "reward_std": 0.6925687789916992, |
| "rewards/reward_func/mean": 0.4781709611415863, |
| "rewards/reward_func/std": 0.6022319793701172, |
| "rewards/soft_overlong_punishment_reward/mean": -0.07312774658203125, |
| "rewards/soft_overlong_punishment_reward/std": 0.23431003093719482, |
| "sampling/importance_sampling_ratio/max": 2.518101215362549, |
| "sampling/importance_sampling_ratio/mean": 0.9911104440689087, |
| "sampling/importance_sampling_ratio/min": 0.22955988347530365, |
| "sampling/sampling_logp_difference/max": 1.4715913534164429, |
| "sampling/sampling_logp_difference/mean": 0.018376227468252182, |
| "step": 27, |
| "step_time": 459.3379825237207 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13431.0, |
| "completions/max_terminated_length": 13431.0, |
| "completions/mean_length": 8919.34375, |
| "completions/mean_terminated_length": 8919.34375, |
| "completions/min_length": 5918.0, |
| "completions/min_terminated_length": 5918.0, |
| "entropy": 0.30696201138198376, |
| "epoch": 0.22764227642276422, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4097852408885956, |
| "learning_rate": 8.92e-07, |
| "loss": 0.0214, |
| "num_tokens": 9904035.0, |
| "reward": 0.961408257484436, |
| "reward_std": 0.7320297360420227, |
| "rewards/reward_func/mean": 0.961408257484436, |
| "rewards/reward_func/std": 0.7320297360420227, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.4255101680755615, |
| "sampling/importance_sampling_ratio/mean": 0.9893720149993896, |
| "sampling/importance_sampling_ratio/min": 0.06206132099032402, |
| "sampling/sampling_logp_difference/max": 2.779632329940796, |
| "sampling/sampling_logp_difference/mean": 0.02153138443827629, |
| "step": 28, |
| "step_time": 289.10520208696835 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16485.0, |
| "completions/max_terminated_length": 16485.0, |
| "completions/mean_length": 11523.40625, |
| "completions/mean_terminated_length": 11523.40625, |
| "completions/min_length": 7697.0, |
| "completions/min_terminated_length": 7697.0, |
| "entropy": 0.278985645622015, |
| "epoch": 0.23577235772357724, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3535861670970917, |
| "learning_rate": 8.88e-07, |
| "loss": 0.0286, |
| "num_tokens": 10295952.0, |
| "reward": 0.4712695777416229, |
| "reward_std": 0.5820122361183167, |
| "rewards/reward_func/mean": 0.4720401465892792, |
| "rewards/reward_func/std": 0.5814188122749329, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00077056884765625, |
| "rewards/soft_overlong_punishment_reward/std": 0.004358995705842972, |
| "sampling/importance_sampling_ratio/max": 2.9528937339782715, |
| "sampling/importance_sampling_ratio/mean": 0.9902883768081665, |
| "sampling/importance_sampling_ratio/min": 0.06855468451976776, |
| "sampling/sampling_logp_difference/max": 2.6801235675811768, |
| "sampling/sampling_logp_difference/mean": 0.0203237347304821, |
| "step": 29, |
| "step_time": 426.5832918223459 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15061.0, |
| "completions/max_terminated_length": 15061.0, |
| "completions/mean_length": 9198.8125, |
| "completions/mean_terminated_length": 9198.8125, |
| "completions/min_length": 3636.0, |
| "completions/min_terminated_length": 3636.0, |
| "entropy": 0.3061511926352978, |
| "epoch": 0.24390243902439024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4075096547603607, |
| "learning_rate": 8.839999999999999e-07, |
| "loss": 0.0272, |
| "num_tokens": 10603946.0, |
| "reward": 0.4550362825393677, |
| "reward_std": 0.4571691155433655, |
| "rewards/reward_func/mean": 0.4550362825393677, |
| "rewards/reward_func/std": 0.4571691155433655, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.989551305770874, |
| "sampling/importance_sampling_ratio/min": 0.4050315022468567, |
| "sampling/sampling_logp_difference/max": 1.1088628768920898, |
| "sampling/sampling_logp_difference/mean": 0.021421968936920166, |
| "step": 30, |
| "step_time": 363.5614477007184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15959.0, |
| "completions/max_terminated_length": 15959.0, |
| "completions/mean_length": 11325.1875, |
| "completions/mean_terminated_length": 11325.1875, |
| "completions/min_length": 8080.0, |
| "completions/min_terminated_length": 8080.0, |
| "entropy": 0.28197295404970646, |
| "epoch": 0.25203252032520324, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3684118986129761, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0276, |
| "num_tokens": 10989504.0, |
| "reward": 0.864295244216919, |
| "reward_std": 1.0850777626037598, |
| "rewards/reward_func/mean": 0.864295244216919, |
| "rewards/reward_func/std": 1.0850777626037598, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901672601699829, |
| "sampling/importance_sampling_ratio/min": 0.0023719461169093847, |
| "sampling/sampling_logp_difference/max": 6.044044494628906, |
| "sampling/sampling_logp_difference/mean": 0.020149797201156616, |
| "step": 31, |
| "step_time": 429.3172168934252 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17952.0, |
| "completions/max_terminated_length": 17952.0, |
| "completions/mean_length": 10753.03125, |
| "completions/mean_terminated_length": 10753.03125, |
| "completions/min_length": 6510.0, |
| "completions/min_terminated_length": 6510.0, |
| "entropy": 0.2921921294182539, |
| "epoch": 0.2601626016260163, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.3402082920074463, |
| "learning_rate": 8.76e-07, |
| "loss": 0.0616, |
| "num_tokens": 11344297.0, |
| "reward": 1.0129244327545166, |
| "reward_std": 0.6747909784317017, |
| "rewards/reward_func/mean": 1.0248873233795166, |
| "rewards/reward_func/std": 0.6532120108604431, |
| "rewards/soft_overlong_punishment_reward/mean": -0.011962890625, |
| "rewards/soft_overlong_punishment_reward/std": 0.06767232716083527, |
| "sampling/importance_sampling_ratio/max": 2.011244058609009, |
| "sampling/importance_sampling_ratio/mean": 0.990020751953125, |
| "sampling/importance_sampling_ratio/min": 0.2620807886123657, |
| "sampling/sampling_logp_difference/max": 1.3391025066375732, |
| "sampling/sampling_logp_difference/mean": 0.020625203847885132, |
| "step": 32, |
| "step_time": 405.72090286947787 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17144.0, |
| "completions/max_terminated_length": 17144.0, |
| "completions/mean_length": 10907.46875, |
| "completions/mean_terminated_length": 10907.46875, |
| "completions/min_length": 4381.0, |
| "completions/min_terminated_length": 4381.0, |
| "entropy": 0.2899629846215248, |
| "epoch": 0.2682926829268293, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38532310724258423, |
| "learning_rate": 8.72e-07, |
| "loss": 0.0188, |
| "num_tokens": 11715560.0, |
| "reward": 0.5839906930923462, |
| "reward_std": 0.715369462966919, |
| "rewards/reward_func/mean": 0.5969988107681274, |
| "rewards/reward_func/std": 0.7036058902740479, |
| "rewards/soft_overlong_punishment_reward/mean": -0.01300811767578125, |
| "rewards/soft_overlong_punishment_reward/std": 0.0447469986975193, |
| "sampling/importance_sampling_ratio/max": 2.857685089111328, |
| "sampling/importance_sampling_ratio/mean": 0.9903793931007385, |
| "sampling/importance_sampling_ratio/min": 0.16617049276828766, |
| "sampling/sampling_logp_difference/max": 1.794740915298462, |
| "sampling/sampling_logp_difference/mean": 0.01988939195871353, |
| "step": 33, |
| "step_time": 451.89993859338574 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15459.0, |
| "completions/max_terminated_length": 15459.0, |
| "completions/mean_length": 10224.9375, |
| "completions/mean_terminated_length": 10224.9375, |
| "completions/min_length": 6120.0, |
| "completions/min_terminated_length": 6120.0, |
| "entropy": 0.29201703891158104, |
| "epoch": 0.2764227642276423, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4802566170692444, |
| "learning_rate": 8.68e-07, |
| "loss": -0.0018, |
| "num_tokens": 12058662.0, |
| "reward": 0.6011689901351929, |
| "reward_std": 0.6944944262504578, |
| "rewards/reward_func/mean": 0.6011689901351929, |
| "rewards/reward_func/std": 0.6944944262504578, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0942885875701904, |
| "sampling/importance_sampling_ratio/mean": 0.9899111390113831, |
| "sampling/importance_sampling_ratio/min": 0.048549018800258636, |
| "sampling/sampling_logp_difference/max": 3.025181293487549, |
| "sampling/sampling_logp_difference/mean": 0.02062905579805374, |
| "step": 34, |
| "step_time": 560.5957993268967 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14074.0, |
| "completions/max_terminated_length": 14074.0, |
| "completions/mean_length": 9658.96875, |
| "completions/mean_terminated_length": 9658.96875, |
| "completions/min_length": 5878.0, |
| "completions/min_terminated_length": 5878.0, |
| "entropy": 0.28341494500637054, |
| "epoch": 0.2845528455284553, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46002474427223206, |
| "learning_rate": 8.639999999999999e-07, |
| "loss": 0.0129, |
| "num_tokens": 12387477.0, |
| "reward": 1.293872594833374, |
| "reward_std": 1.2786844968795776, |
| "rewards/reward_func/mean": 1.293872594833374, |
| "rewards/reward_func/std": 1.2786844968795776, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.6462442874908447, |
| "sampling/importance_sampling_ratio/mean": 0.9899374842643738, |
| "sampling/importance_sampling_ratio/min": 0.004488667007535696, |
| "sampling/sampling_logp_difference/max": 5.4061994552612305, |
| "sampling/sampling_logp_difference/mean": 0.02022422105073929, |
| "step": 35, |
| "step_time": 324.04454420367256 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16723.0, |
| "completions/max_terminated_length": 16723.0, |
| "completions/mean_length": 10310.90625, |
| "completions/mean_terminated_length": 10310.90625, |
| "completions/min_length": 6380.0, |
| "completions/min_terminated_length": 6380.0, |
| "entropy": 0.26594763714820147, |
| "epoch": 0.2926829268292683, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3799055516719818, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0193, |
| "num_tokens": 12743762.0, |
| "reward": 1.0520066022872925, |
| "reward_std": 1.5496177673339844, |
| "rewards/reward_func/mean": 1.0545929670333862, |
| "rewards/reward_func/std": 1.5477306842803955, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00258636474609375, |
| "rewards/soft_overlong_punishment_reward/std": 0.014630688354372978, |
| "sampling/importance_sampling_ratio/max": 2.5548853874206543, |
| "sampling/importance_sampling_ratio/mean": 0.9906677007675171, |
| "sampling/importance_sampling_ratio/min": 0.34483057260513306, |
| "sampling/sampling_logp_difference/max": 1.064702033996582, |
| "sampling/sampling_logp_difference/mean": 0.018773481249809265, |
| "step": 36, |
| "step_time": 596.4897175964434 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14483.0, |
| "completions/max_terminated_length": 14483.0, |
| "completions/mean_length": 8411.0, |
| "completions/mean_terminated_length": 8411.0, |
| "completions/min_length": 2507.0, |
| "completions/min_terminated_length": 2507.0, |
| "entropy": 0.28924608789384365, |
| "epoch": 0.3008130081300813, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.44713178277015686, |
| "learning_rate": 8.559999999999999e-07, |
| "loss": -0.0091, |
| "num_tokens": 13027682.0, |
| "reward": 0.9169277548789978, |
| "reward_std": 0.6975835561752319, |
| "rewards/reward_func/mean": 0.9169277548789978, |
| "rewards/reward_func/std": 0.6975834965705872, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9900239706039429, |
| "sampling/importance_sampling_ratio/min": 0.2362688183784485, |
| "sampling/sampling_logp_difference/max": 1.4427850246429443, |
| "sampling/sampling_logp_difference/mean": 0.020912623032927513, |
| "step": 37, |
| "step_time": 514.7615225652698 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14294.0, |
| "completions/max_terminated_length": 14294.0, |
| "completions/mean_length": 10098.125, |
| "completions/mean_terminated_length": 10098.125, |
| "completions/min_length": 5654.0, |
| "completions/min_terminated_length": 5654.0, |
| "entropy": 0.2681300761178136, |
| "epoch": 0.3089430894308943, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3977094292640686, |
| "learning_rate": 8.52e-07, |
| "loss": 0.0076, |
| "num_tokens": 13373614.0, |
| "reward": 0.3022610545158386, |
| "reward_std": 0.37053707242012024, |
| "rewards/reward_func/mean": 0.3022610545158386, |
| "rewards/reward_func/std": 0.37053707242012024, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9909188747406006, |
| "sampling/importance_sampling_ratio/min": 0.13540859520435333, |
| "sampling/sampling_logp_difference/max": 1.9994584321975708, |
| "sampling/sampling_logp_difference/mean": 0.018634535372257233, |
| "step": 38, |
| "step_time": 455.13201168039814 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15760.0, |
| "completions/max_terminated_length": 15760.0, |
| "completions/mean_length": 11131.75, |
| "completions/mean_terminated_length": 11131.75, |
| "completions/min_length": 8044.0, |
| "completions/min_terminated_length": 8044.0, |
| "entropy": 0.2536044828593731, |
| "epoch": 0.3170731707317073, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40394505858421326, |
| "learning_rate": 8.48e-07, |
| "loss": 0.0142, |
| "num_tokens": 13752318.0, |
| "reward": 0.5811458230018616, |
| "reward_std": 0.6193008422851562, |
| "rewards/reward_func/mean": 0.5811458230018616, |
| "rewards/reward_func/std": 0.6193007826805115, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9912146329879761, |
| "sampling/importance_sampling_ratio/min": 0.02844265289604664, |
| "sampling/sampling_logp_difference/max": 3.5598654747009277, |
| "sampling/sampling_logp_difference/mean": 0.018120869994163513, |
| "step": 39, |
| "step_time": 425.23402834986337 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14644.0, |
| "completions/max_terminated_length": 14644.0, |
| "completions/mean_length": 7563.40625, |
| "completions/mean_terminated_length": 7563.40625, |
| "completions/min_length": 2451.0, |
| "completions/min_terminated_length": 2451.0, |
| "entropy": 0.30642482824623585, |
| "epoch": 0.3252032520325203, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.4431585669517517, |
| "learning_rate": 8.439999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 14004859.0, |
| "reward": 0.8984022736549377, |
| "reward_std": 0.7316007614135742, |
| "rewards/reward_func/mean": 0.8984022736549377, |
| "rewards/reward_func/std": 0.7316007018089294, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.272446870803833, |
| "sampling/importance_sampling_ratio/mean": 0.9895340204238892, |
| "sampling/importance_sampling_ratio/min": 0.10593052953481674, |
| "sampling/sampling_logp_difference/max": 2.244971752166748, |
| "sampling/sampling_logp_difference/mean": 0.021232325583696365, |
| "step": 40, |
| "step_time": 422.6681289859116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 11990.0, |
| "completions/max_terminated_length": 11990.0, |
| "completions/mean_length": 8507.84375, |
| "completions/mean_terminated_length": 8507.84375, |
| "completions/min_length": 4808.0, |
| "completions/min_terminated_length": 4808.0, |
| "entropy": 0.29619857482612133, |
| "epoch": 0.3333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4351663589477539, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": 0.029, |
| "num_tokens": 14290710.0, |
| "reward": 1.4823139905929565, |
| "reward_std": 0.973460853099823, |
| "rewards/reward_func/mean": 1.4823139905929565, |
| "rewards/reward_func/std": 0.973460853099823, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9896551370620728, |
| "sampling/importance_sampling_ratio/min": 0.32150232791900635, |
| "sampling/sampling_logp_difference/max": 1.4671876430511475, |
| "sampling/sampling_logp_difference/mean": 0.020717762410640717, |
| "step": 41, |
| "step_time": 281.4393497968558 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16896.0, |
| "completions/max_terminated_length": 16896.0, |
| "completions/mean_length": 11579.0, |
| "completions/mean_terminated_length": 11579.0, |
| "completions/min_length": 7481.0, |
| "completions/min_terminated_length": 7481.0, |
| "entropy": 0.25882996898144484, |
| "epoch": 0.34146341463414637, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3679807484149933, |
| "learning_rate": 8.359999999999999e-07, |
| "loss": 0.032, |
| "num_tokens": 14682342.0, |
| "reward": 2.730255365371704, |
| "reward_std": 13.133237838745117, |
| "rewards/reward_func/mean": 2.734161615371704, |
| "rewards/reward_func/std": 13.132379531860352, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00390625, |
| "rewards/soft_overlong_punishment_reward/std": 0.022097086533904076, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.990980863571167, |
| "sampling/importance_sampling_ratio/min": 0.017838984727859497, |
| "sampling/sampling_logp_difference/max": 4.026369094848633, |
| "sampling/sampling_logp_difference/mean": 0.018732603639364243, |
| "step": 42, |
| "step_time": 419.91274624480866 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15685.0, |
| "completions/max_terminated_length": 15685.0, |
| "completions/mean_length": 12179.1875, |
| "completions/mean_terminated_length": 12179.1875, |
| "completions/min_length": 8132.0, |
| "completions/min_terminated_length": 8132.0, |
| "entropy": 0.2707740031182766, |
| "epoch": 0.34959349593495936, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3744344115257263, |
| "learning_rate": 8.319999999999999e-07, |
| "loss": -0.0033, |
| "num_tokens": 15096724.0, |
| "reward": 0.6232322454452515, |
| "reward_std": 0.8935548663139343, |
| "rewards/reward_func/mean": 0.6232322454452515, |
| "rewards/reward_func/std": 0.8935548067092896, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9905098676681519, |
| "sampling/importance_sampling_ratio/min": 0.30308058857917786, |
| "sampling/sampling_logp_difference/max": 1.1937564611434937, |
| "sampling/sampling_logp_difference/mean": 0.019495470449328423, |
| "step": 43, |
| "step_time": 603.9360419125296 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14580.0, |
| "completions/max_terminated_length": 14580.0, |
| "completions/mean_length": 7975.375, |
| "completions/mean_terminated_length": 7975.375, |
| "completions/min_length": 3591.0, |
| "completions/min_terminated_length": 3591.0, |
| "entropy": 0.3197885435074568, |
| "epoch": 0.35772357723577236, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.3738175928592682, |
| "learning_rate": 8.28e-07, |
| "loss": -0.0016, |
| "num_tokens": 15360248.0, |
| "reward": 0.7166826725006104, |
| "reward_std": 0.6693564057350159, |
| "rewards/reward_func/mean": 0.7166826725006104, |
| "rewards/reward_func/std": 0.6693564057350159, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.324842929840088, |
| "sampling/importance_sampling_ratio/mean": 0.9886108040809631, |
| "sampling/importance_sampling_ratio/min": 0.3598307967185974, |
| "sampling/sampling_logp_difference/max": 1.0221214294433594, |
| "sampling/sampling_logp_difference/mean": 0.022567734122276306, |
| "step": 44, |
| "step_time": 345.220199523028 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 20480.0, |
| "completions/max_terminated_length": 19443.0, |
| "completions/mean_length": 10098.34375, |
| "completions/mean_terminated_length": 9024.37890625, |
| "completions/min_length": 2638.0, |
| "completions/min_terminated_length": 2638.0, |
| "entropy": 0.2705630399286747, |
| "epoch": 0.36585365853658536, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3647894561290741, |
| "learning_rate": 8.24e-07, |
| "loss": -0.0494, |
| "num_tokens": 15697299.0, |
| "reward": 0.4475719630718231, |
| "reward_std": 0.769990086555481, |
| "rewards/reward_func/mean": 0.6297746896743774, |
| "rewards/reward_func/std": 0.5889378786087036, |
| "rewards/soft_overlong_punishment_reward/mean": -0.12316131591796875, |
| "rewards/soft_overlong_punishment_reward/std": 0.3167433738708496, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901247024536133, |
| "sampling/importance_sampling_ratio/min": 0.1971900314092636, |
| "sampling/sampling_logp_difference/max": 1.6235873699188232, |
| "sampling/sampling_logp_difference/mean": 0.02022046223282814, |
| "step": 45, |
| "step_time": 460.2549244968686 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15474.0, |
| "completions/max_terminated_length": 15474.0, |
| "completions/mean_length": 10772.21875, |
| "completions/mean_terminated_length": 10772.21875, |
| "completions/min_length": 5099.0, |
| "completions/min_terminated_length": 5099.0, |
| "entropy": 0.28201122768223286, |
| "epoch": 0.37398373983739835, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36336085200309753, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": -0.0064, |
| "num_tokens": 16060146.0, |
| "reward": 0.6875309944152832, |
| "reward_std": 0.8879403471946716, |
| "rewards/reward_func/mean": 0.6875309944152832, |
| "rewards/reward_func/std": 0.8879403471946716, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.298703670501709, |
| "sampling/importance_sampling_ratio/mean": 0.9902758598327637, |
| "sampling/importance_sampling_ratio/min": 0.1755373477935791, |
| "sampling/sampling_logp_difference/max": 1.739903450012207, |
| "sampling/sampling_logp_difference/mean": 0.019840704277157784, |
| "step": 46, |
| "step_time": 391.7966483985074 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12558.0, |
| "completions/max_terminated_length": 12558.0, |
| "completions/mean_length": 10090.78125, |
| "completions/mean_terminated_length": 10090.78125, |
| "completions/min_length": 5820.0, |
| "completions/min_terminated_length": 5820.0, |
| "entropy": 0.27039683051407337, |
| "epoch": 0.3821138211382114, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.31569579243659973, |
| "learning_rate": 8.159999999999999e-07, |
| "loss": -0.009, |
| "num_tokens": 16401115.0, |
| "reward": 0.7058383226394653, |
| "reward_std": 0.6476169228553772, |
| "rewards/reward_func/mean": 0.7058383226394653, |
| "rewards/reward_func/std": 0.6476169228553772, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0942885875701904, |
| "sampling/importance_sampling_ratio/mean": 0.9906511902809143, |
| "sampling/importance_sampling_ratio/min": 0.20646513998508453, |
| "sampling/sampling_logp_difference/max": 1.577623724937439, |
| "sampling/sampling_logp_difference/mean": 0.019238049164414406, |
| "step": 47, |
| "step_time": 322.8998982391786 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16934.0, |
| "completions/max_terminated_length": 16934.0, |
| "completions/mean_length": 10200.96875, |
| "completions/mean_terminated_length": 10200.96875, |
| "completions/min_length": 4126.0, |
| "completions/min_terminated_length": 4126.0, |
| "entropy": 0.2863293197005987, |
| "epoch": 0.3902439024390244, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4074859917163849, |
| "learning_rate": 8.12e-07, |
| "loss": 0.0823, |
| "num_tokens": 16748266.0, |
| "reward": 2.437450647354126, |
| "reward_std": 4.401010990142822, |
| "rewards/reward_func/mean": 2.4416468143463135, |
| "rewards/reward_func/std": 4.398542881011963, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0041961669921875, |
| "rewards/soft_overlong_punishment_reward/std": 0.02373710460960865, |
| "sampling/importance_sampling_ratio/max": 2.655956745147705, |
| "sampling/importance_sampling_ratio/mean": 0.9903603196144104, |
| "sampling/importance_sampling_ratio/min": 0.21017666161060333, |
| "sampling/sampling_logp_difference/max": 1.5598068237304688, |
| "sampling/sampling_logp_difference/mean": 0.02004138007760048, |
| "step": 48, |
| "step_time": 607.1050487488974 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15864.0, |
| "completions/max_terminated_length": 15864.0, |
| "completions/mean_length": 10736.78125, |
| "completions/mean_terminated_length": 10736.78125, |
| "completions/min_length": 7612.0, |
| "completions/min_terminated_length": 7612.0, |
| "entropy": 0.26676939986646175, |
| "epoch": 0.3983739837398374, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3676176965236664, |
| "learning_rate": 8.08e-07, |
| "loss": -0.0161, |
| "num_tokens": 17112683.0, |
| "reward": 1.2646254301071167, |
| "reward_std": 0.9504536390304565, |
| "rewards/reward_func/mean": 1.2646254301071167, |
| "rewards/reward_func/std": 0.9504537582397461, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9370557069778442, |
| "sampling/importance_sampling_ratio/mean": 0.9906842708587646, |
| "sampling/importance_sampling_ratio/min": 0.3161299228668213, |
| "sampling/sampling_logp_difference/max": 1.151602029800415, |
| "sampling/sampling_logp_difference/mean": 0.019057895988225937, |
| "step": 49, |
| "step_time": 552.4623964948114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17648.0, |
| "completions/max_terminated_length": 17648.0, |
| "completions/mean_length": 12040.8125, |
| "completions/mean_terminated_length": 12040.8125, |
| "completions/min_length": 7400.0, |
| "completions/min_terminated_length": 7400.0, |
| "entropy": 0.26970171742141247, |
| "epoch": 0.4065040650406504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35298776626586914, |
| "learning_rate": 8.04e-07, |
| "loss": 0.0134, |
| "num_tokens": 17522533.0, |
| "reward": 0.883129358291626, |
| "reward_std": 0.6038604974746704, |
| "rewards/reward_func/mean": 0.892772912979126, |
| "rewards/reward_func/std": 0.5872755646705627, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0096435546875, |
| "rewards/soft_overlong_punishment_reward/std": 0.05455218255519867, |
| "sampling/importance_sampling_ratio/max": 2.493983745574951, |
| "sampling/importance_sampling_ratio/mean": 0.9904340505599976, |
| "sampling/importance_sampling_ratio/min": 0.058902557939291, |
| "sampling/sampling_logp_difference/max": 2.8318707942962646, |
| "sampling/sampling_logp_difference/mean": 0.01974073052406311, |
| "step": 50, |
| "step_time": 399.5289772397373 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16737.0, |
| "completions/max_terminated_length": 16737.0, |
| "completions/mean_length": 11610.8125, |
| "completions/mean_terminated_length": 11610.8125, |
| "completions/min_length": 6722.0, |
| "completions/min_terminated_length": 6722.0, |
| "entropy": 0.27386337146162987, |
| "epoch": 0.4146341463414634, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3504253029823303, |
| "learning_rate": 8e-07, |
| "loss": 0.0388, |
| "num_tokens": 17911655.0, |
| "reward": 0.27469968795776367, |
| "reward_std": 0.5160353183746338, |
| "rewards/reward_func/mean": 0.2773928642272949, |
| "rewards/reward_func/std": 0.5143131017684937, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00269317626953125, |
| "rewards/soft_overlong_punishment_reward/std": 0.015234904363751411, |
| "sampling/importance_sampling_ratio/max": 2.0953574180603027, |
| "sampling/importance_sampling_ratio/mean": 0.9904600381851196, |
| "sampling/importance_sampling_ratio/min": 0.2542196810245514, |
| "sampling/sampling_logp_difference/max": 1.3695564270019531, |
| "sampling/sampling_logp_difference/mean": 0.019640503451228142, |
| "step": 51, |
| "step_time": 399.3099301927723 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15086.0, |
| "completions/max_terminated_length": 15086.0, |
| "completions/mean_length": 10634.1875, |
| "completions/mean_terminated_length": 10634.1875, |
| "completions/min_length": 6556.0, |
| "completions/min_terminated_length": 6556.0, |
| "entropy": 0.30065641924738884, |
| "epoch": 0.42276422764227645, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.45073068141937256, |
| "learning_rate": 7.96e-07, |
| "loss": -0.0302, |
| "num_tokens": 18269029.0, |
| "reward": 0.7840328216552734, |
| "reward_std": 0.9923391342163086, |
| "rewards/reward_func/mean": 0.7840328216552734, |
| "rewards/reward_func/std": 0.9923390746116638, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3403565883636475, |
| "sampling/importance_sampling_ratio/mean": 0.9895017147064209, |
| "sampling/importance_sampling_ratio/min": 0.001712719677016139, |
| "sampling/sampling_logp_difference/max": 6.369672775268555, |
| "sampling/sampling_logp_difference/mean": 0.021533269435167313, |
| "step": 52, |
| "step_time": 380.3188033842016 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14572.0, |
| "completions/max_terminated_length": 14572.0, |
| "completions/mean_length": 10499.125, |
| "completions/mean_terminated_length": 10499.125, |
| "completions/min_length": 6328.0, |
| "completions/min_terminated_length": 6328.0, |
| "entropy": 0.283049238845706, |
| "epoch": 0.43089430894308944, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37461337447166443, |
| "learning_rate": 7.92e-07, |
| "loss": -0.0126, |
| "num_tokens": 18619073.0, |
| "reward": 0.9159905910491943, |
| "reward_std": 0.7809432744979858, |
| "rewards/reward_func/mean": 0.9159905910491943, |
| "rewards/reward_func/std": 0.7809432148933411, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.277055263519287, |
| "sampling/importance_sampling_ratio/mean": 0.9902773499488831, |
| "sampling/importance_sampling_ratio/min": 0.279052734375, |
| "sampling/sampling_logp_difference/max": 1.276354432106018, |
| "sampling/sampling_logp_difference/mean": 0.01988813653588295, |
| "step": 53, |
| "step_time": 503.7549276971258 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14762.0, |
| "completions/max_terminated_length": 14762.0, |
| "completions/mean_length": 10943.90625, |
| "completions/mean_terminated_length": 10943.90625, |
| "completions/min_length": 7617.0, |
| "completions/min_terminated_length": 7617.0, |
| "entropy": 0.2909552175551653, |
| "epoch": 0.43902439024390244, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40936458110809326, |
| "learning_rate": 7.88e-07, |
| "loss": -0.0008, |
| "num_tokens": 18986318.0, |
| "reward": 0.9623677730560303, |
| "reward_std": 0.6776258945465088, |
| "rewards/reward_func/mean": 0.9623677730560303, |
| "rewards/reward_func/std": 0.677625834941864, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.733780860900879, |
| "sampling/importance_sampling_ratio/mean": 0.9898478388786316, |
| "sampling/importance_sampling_ratio/min": 0.11436156183481216, |
| "sampling/sampling_logp_difference/max": 2.1683902740478516, |
| "sampling/sampling_logp_difference/mean": 0.020851192995905876, |
| "step": 54, |
| "step_time": 412.2992678086739 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13410.0, |
| "completions/max_terminated_length": 13410.0, |
| "completions/mean_length": 8160.375, |
| "completions/mean_terminated_length": 8160.375, |
| "completions/min_length": 3340.0, |
| "completions/min_terminated_length": 3340.0, |
| "entropy": 0.29048606380820274, |
| "epoch": 0.44715447154471544, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46355557441711426, |
| "learning_rate": 7.84e-07, |
| "loss": 0.0469, |
| "num_tokens": 19264786.0, |
| "reward": 1.9834721088409424, |
| "reward_std": 1.6326876878738403, |
| "rewards/reward_func/mean": 1.9834721088409424, |
| "rewards/reward_func/std": 1.6326874494552612, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3705220222473145, |
| "sampling/importance_sampling_ratio/mean": 0.9901043176651001, |
| "sampling/importance_sampling_ratio/min": 0.23068562150001526, |
| "sampling/sampling_logp_difference/max": 1.466699481010437, |
| "sampling/sampling_logp_difference/mean": 0.01999868080019951, |
| "step": 55, |
| "step_time": 352.4861610988155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16042.0, |
| "completions/max_terminated_length": 16042.0, |
| "completions/mean_length": 11029.0, |
| "completions/mean_terminated_length": 11029.0, |
| "completions/min_length": 7361.0, |
| "completions/min_terminated_length": 7361.0, |
| "entropy": 0.2770060170441866, |
| "epoch": 0.45528455284552843, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35951629281044006, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": 0.01, |
| "num_tokens": 19632250.0, |
| "reward": 0.7494408488273621, |
| "reward_std": 0.6710403561592102, |
| "rewards/reward_func/mean": 0.7494408488273621, |
| "rewards/reward_func/std": 0.6710402965545654, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9904565811157227, |
| "sampling/importance_sampling_ratio/min": 0.19462724030017853, |
| "sampling/sampling_logp_difference/max": 1.6366691589355469, |
| "sampling/sampling_logp_difference/mean": 0.01998082734644413, |
| "step": 56, |
| "step_time": 580.471678117523 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16223.0, |
| "completions/max_terminated_length": 16223.0, |
| "completions/mean_length": 10092.53125, |
| "completions/mean_terminated_length": 10092.53125, |
| "completions/min_length": 3841.0, |
| "completions/min_terminated_length": 3841.0, |
| "entropy": 0.27271045185625553, |
| "epoch": 0.4634146341463415, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40269845724105835, |
| "learning_rate": 7.76e-07, |
| "loss": 0.0223, |
| "num_tokens": 19975051.0, |
| "reward": 0.7949599027633667, |
| "reward_std": 0.6612140536308289, |
| "rewards/reward_func/mean": 0.7949599027633667, |
| "rewards/reward_func/std": 0.6612139940261841, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9906999468803406, |
| "sampling/importance_sampling_ratio/min": 0.2476702779531479, |
| "sampling/sampling_logp_difference/max": 2.0463411808013916, |
| "sampling/sampling_logp_difference/mean": 0.019406329840421677, |
| "step": 57, |
| "step_time": 399.2233660905622 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15678.0, |
| "completions/max_terminated_length": 15678.0, |
| "completions/mean_length": 10561.5, |
| "completions/mean_terminated_length": 10561.5, |
| "completions/min_length": 4751.0, |
| "completions/min_terminated_length": 4751.0, |
| "entropy": 0.2965373918414116, |
| "epoch": 0.4715447154471545, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3839688003063202, |
| "learning_rate": 7.72e-07, |
| "loss": 0.0454, |
| "num_tokens": 20327715.0, |
| "reward": 0.7230905294418335, |
| "reward_std": 0.6203462481498718, |
| "rewards/reward_func/mean": 0.7230905294418335, |
| "rewards/reward_func/std": 0.6203462481498718, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.706266164779663, |
| "sampling/importance_sampling_ratio/mean": 0.9895825982093811, |
| "sampling/importance_sampling_ratio/min": 0.18361897766590118, |
| "sampling/sampling_logp_difference/max": 1.694892406463623, |
| "sampling/sampling_logp_difference/mean": 0.020958570763468742, |
| "step": 58, |
| "step_time": 389.2628999436274 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17072.0, |
| "completions/max_terminated_length": 17072.0, |
| "completions/mean_length": 9915.34375, |
| "completions/mean_terminated_length": 9915.34375, |
| "completions/min_length": 4302.0, |
| "completions/min_terminated_length": 4302.0, |
| "entropy": 0.3017471991479397, |
| "epoch": 0.4796747967479675, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4128898084163666, |
| "learning_rate": 7.68e-07, |
| "loss": 0.0444, |
| "num_tokens": 20657838.0, |
| "reward": 1.6393874883651733, |
| "reward_std": 1.3633579015731812, |
| "rewards/reward_func/mean": 1.6446365118026733, |
| "rewards/reward_func/std": 1.3566806316375732, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0052490234375, |
| "rewards/soft_overlong_punishment_reward/std": 0.029692960903048515, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9894176721572876, |
| "sampling/importance_sampling_ratio/min": 0.06410310417413712, |
| "sampling/sampling_logp_difference/max": 2.747262477874756, |
| "sampling/sampling_logp_difference/mean": 0.021471522748470306, |
| "step": 59, |
| "step_time": 475.307358373655 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 19170.0, |
| "completions/max_terminated_length": 19170.0, |
| "completions/mean_length": 9260.875, |
| "completions/mean_terminated_length": 9260.875, |
| "completions/min_length": 1932.0, |
| "completions/min_terminated_length": 1932.0, |
| "entropy": 0.28078791592270136, |
| "epoch": 0.4878048780487805, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4470166265964508, |
| "learning_rate": 7.64e-07, |
| "loss": 0.0672, |
| "num_tokens": 20979498.0, |
| "reward": 0.8571816682815552, |
| "reward_std": 0.9841914176940918, |
| "rewards/reward_func/mean": 0.8915902376174927, |
| "rewards/reward_func/std": 0.9419331550598145, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0344085693359375, |
| "rewards/soft_overlong_punishment_reward/std": 0.13934272527694702, |
| "sampling/importance_sampling_ratio/max": 2.326169729232788, |
| "sampling/importance_sampling_ratio/mean": 0.9909319877624512, |
| "sampling/importance_sampling_ratio/min": 0.1588689088821411, |
| "sampling/sampling_logp_difference/max": 1.8396759033203125, |
| "sampling/sampling_logp_difference/mean": 0.01866196282207966, |
| "step": 60, |
| "step_time": 525.5024625072256 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17185.0, |
| "completions/max_terminated_length": 17185.0, |
| "completions/mean_length": 10522.3125, |
| "completions/mean_terminated_length": 10522.3125, |
| "completions/min_length": 5064.0, |
| "completions/min_terminated_length": 5064.0, |
| "entropy": 0.29093262180685997, |
| "epoch": 0.4959349593495935, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4672902822494507, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": -0.0478, |
| "num_tokens": 21332124.0, |
| "reward": 0.6969435214996338, |
| "reward_std": 0.6751585602760315, |
| "rewards/reward_func/mean": 0.703054666519165, |
| "rewards/reward_func/std": 0.6807836890220642, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00611114501953125, |
| "rewards/soft_overlong_punishment_reward/std": 0.03456985577940941, |
| "sampling/importance_sampling_ratio/max": 2.0167672634124756, |
| "sampling/importance_sampling_ratio/mean": 0.9900118112564087, |
| "sampling/importance_sampling_ratio/min": 0.29739901423454285, |
| "sampling/sampling_logp_difference/max": 1.2126805782318115, |
| "sampling/sampling_logp_difference/mean": 0.020228687673807144, |
| "step": 61, |
| "step_time": 385.8926363585051 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 18314.0, |
| "completions/max_terminated_length": 18314.0, |
| "completions/mean_length": 10564.25, |
| "completions/mean_terminated_length": 10564.25, |
| "completions/min_length": 7173.0, |
| "completions/min_terminated_length": 7173.0, |
| "entropy": 0.2720603086054325, |
| "epoch": 0.5040650406504065, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.443337082862854, |
| "learning_rate": 7.559999999999999e-07, |
| "loss": -0.0205, |
| "num_tokens": 21691372.0, |
| "reward": 1.099975347518921, |
| "reward_std": 0.7780046463012695, |
| "rewards/reward_func/mean": 1.1196210384368896, |
| "rewards/reward_func/std": 0.7830961346626282, |
| "rewards/soft_overlong_punishment_reward/mean": -0.01964569091796875, |
| "rewards/soft_overlong_punishment_reward/std": 0.08696826547384262, |
| "sampling/importance_sampling_ratio/max": 2.837948799133301, |
| "sampling/importance_sampling_ratio/mean": 0.990537166595459, |
| "sampling/importance_sampling_ratio/min": 0.06719715893268585, |
| "sampling/sampling_logp_difference/max": 2.7001242637634277, |
| "sampling/sampling_logp_difference/mean": 0.019431207329034805, |
| "step": 62, |
| "step_time": 443.5746869649738 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14595.0, |
| "completions/max_terminated_length": 14595.0, |
| "completions/mean_length": 10805.375, |
| "completions/mean_terminated_length": 10805.375, |
| "completions/min_length": 5937.0, |
| "completions/min_terminated_length": 5937.0, |
| "entropy": 0.27174041885882616, |
| "epoch": 0.5121951219512195, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5056492686271667, |
| "learning_rate": 7.52e-07, |
| "loss": 0.009, |
| "num_tokens": 22063440.0, |
| "reward": 1.1353144645690918, |
| "reward_std": 0.8601508736610413, |
| "rewards/reward_func/mean": 1.1353144645690918, |
| "rewards/reward_func/std": 0.8601508736610413, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1125802993774414, |
| "sampling/importance_sampling_ratio/mean": 0.9905575513839722, |
| "sampling/importance_sampling_ratio/min": 0.0975913405418396, |
| "sampling/sampling_logp_difference/max": 2.3269665241241455, |
| "sampling/sampling_logp_difference/mean": 0.019539430737495422, |
| "step": 63, |
| "step_time": 380.66839726571925 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14770.0, |
| "completions/max_terminated_length": 14770.0, |
| "completions/mean_length": 9684.9375, |
| "completions/mean_terminated_length": 9684.9375, |
| "completions/min_length": 4925.0, |
| "completions/min_terminated_length": 4925.0, |
| "entropy": 0.27461021207273006, |
| "epoch": 0.5203252032520326, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37078821659088135, |
| "learning_rate": 7.48e-07, |
| "loss": 0.0588, |
| "num_tokens": 22385238.0, |
| "reward": 1.3261408805847168, |
| "reward_std": 0.6705518364906311, |
| "rewards/reward_func/mean": 1.3261408805847168, |
| "rewards/reward_func/std": 0.6705518364906311, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.990297794342041, |
| "sampling/importance_sampling_ratio/min": 0.06889015436172485, |
| "sampling/sampling_logp_difference/max": 2.6752419471740723, |
| "sampling/sampling_logp_difference/mean": 0.0197505634278059, |
| "step": 64, |
| "step_time": 334.4651955710724 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12913.0, |
| "completions/max_terminated_length": 12913.0, |
| "completions/mean_length": 9230.75, |
| "completions/mean_terminated_length": 9230.75, |
| "completions/min_length": 4211.0, |
| "completions/min_terminated_length": 4211.0, |
| "entropy": 0.29658956080675125, |
| "epoch": 0.5284552845528455, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4264526069164276, |
| "learning_rate": 7.44e-07, |
| "loss": -0.0405, |
| "num_tokens": 22691030.0, |
| "reward": 1.2184321880340576, |
| "reward_std": 1.2537956237792969, |
| "rewards/reward_func/mean": 1.2184321880340576, |
| "rewards/reward_func/std": 1.2537956237792969, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5008950233459473, |
| "sampling/importance_sampling_ratio/mean": 0.9899071455001831, |
| "sampling/importance_sampling_ratio/min": 0.25040245056152344, |
| "sampling/sampling_logp_difference/max": 1.3846858739852905, |
| "sampling/sampling_logp_difference/mean": 0.021074209362268448, |
| "step": 65, |
| "step_time": 428.82489290018566 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14838.0, |
| "completions/max_terminated_length": 14838.0, |
| "completions/mean_length": 10752.8125, |
| "completions/mean_terminated_length": 10752.8125, |
| "completions/min_length": 6496.0, |
| "completions/min_terminated_length": 6496.0, |
| "entropy": 0.24973559752106667, |
| "epoch": 0.5365853658536586, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.375764399766922, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0081, |
| "num_tokens": 23060872.0, |
| "reward": 0.6262708306312561, |
| "reward_std": 0.6390258073806763, |
| "rewards/reward_func/mean": 0.6262708306312561, |
| "rewards/reward_func/std": 0.6390258073806763, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5996079444885254, |
| "sampling/importance_sampling_ratio/mean": 0.9914066791534424, |
| "sampling/importance_sampling_ratio/min": 0.24700190126895905, |
| "sampling/sampling_logp_difference/max": 1.3983592987060547, |
| "sampling/sampling_logp_difference/mean": 0.018258236348628998, |
| "step": 66, |
| "step_time": 422.3102921405807 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 19187.0, |
| "completions/max_terminated_length": 19187.0, |
| "completions/mean_length": 10835.5625, |
| "completions/mean_terminated_length": 10835.5625, |
| "completions/min_length": 6083.0, |
| "completions/min_terminated_length": 6083.0, |
| "entropy": 0.2816160637885332, |
| "epoch": 0.5447154471544715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40830785036087036, |
| "learning_rate": 7.359999999999999e-07, |
| "loss": 0.0398, |
| "num_tokens": 23421858.0, |
| "reward": 0.5286274552345276, |
| "reward_std": 0.5896045565605164, |
| "rewards/reward_func/mean": 0.5500126481056213, |
| "rewards/reward_func/std": 0.5576050877571106, |
| "rewards/soft_overlong_punishment_reward/mean": -0.02138519287109375, |
| "rewards/soft_overlong_punishment_reward/std": 0.12097291648387909, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901546239852905, |
| "sampling/importance_sampling_ratio/min": 0.0072767846286296844, |
| "sampling/sampling_logp_difference/max": 4.923066139221191, |
| "sampling/sampling_logp_difference/mean": 0.020440150052309036, |
| "step": 67, |
| "step_time": 582.3627753325272 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14479.0, |
| "completions/max_terminated_length": 14479.0, |
| "completions/mean_length": 9882.84375, |
| "completions/mean_terminated_length": 9882.84375, |
| "completions/min_length": 5433.0, |
| "completions/min_terminated_length": 5433.0, |
| "entropy": 0.3057326711714268, |
| "epoch": 0.5528455284552846, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42447325587272644, |
| "learning_rate": 7.319999999999999e-07, |
| "loss": 0.0463, |
| "num_tokens": 23753165.0, |
| "reward": 1.0461750030517578, |
| "reward_std": 1.0070804357528687, |
| "rewards/reward_func/mean": 1.0461750030517578, |
| "rewards/reward_func/std": 1.007080316543579, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.217542886734009, |
| "sampling/importance_sampling_ratio/mean": 0.989412248134613, |
| "sampling/importance_sampling_ratio/min": 3.028254241144168e-06, |
| "sampling/sampling_logp_difference/max": 12.707524299621582, |
| "sampling/sampling_logp_difference/mean": 0.021606117486953735, |
| "step": 68, |
| "step_time": 338.7651969024446 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16279.0, |
| "completions/max_terminated_length": 16279.0, |
| "completions/mean_length": 9928.25, |
| "completions/mean_terminated_length": 9928.25, |
| "completions/min_length": 5870.0, |
| "completions/min_terminated_length": 5870.0, |
| "entropy": 0.2900556083768606, |
| "epoch": 0.5609756097560976, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5427579879760742, |
| "learning_rate": 7.28e-07, |
| "loss": 0.0345, |
| "num_tokens": 24090621.0, |
| "reward": 1.0686248540878296, |
| "reward_std": 0.8566891551017761, |
| "rewards/reward_func/mean": 1.0686248540878296, |
| "rewards/reward_func/std": 0.8566891551017761, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9899448156356812, |
| "sampling/importance_sampling_ratio/min": 1.2153052011854015e-05, |
| "sampling/sampling_logp_difference/max": 11.317930221557617, |
| "sampling/sampling_logp_difference/mean": 0.020517408847808838, |
| "step": 69, |
| "step_time": 445.5511495040264 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14088.0, |
| "completions/max_terminated_length": 14088.0, |
| "completions/mean_length": 8179.03125, |
| "completions/mean_terminated_length": 8179.03125, |
| "completions/min_length": 3039.0, |
| "completions/min_terminated_length": 3039.0, |
| "entropy": 0.2969997953623533, |
| "epoch": 0.5691056910569106, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4507758319377899, |
| "learning_rate": 7.24e-07, |
| "loss": 0.0584, |
| "num_tokens": 24369758.0, |
| "reward": 1.1863713264465332, |
| "reward_std": 0.8616743087768555, |
| "rewards/reward_func/mean": 1.1863713264465332, |
| "rewards/reward_func/std": 0.8616743087768555, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5807595252990723, |
| "sampling/importance_sampling_ratio/mean": 0.989867091178894, |
| "sampling/importance_sampling_ratio/min": 0.2350781410932541, |
| "sampling/sampling_logp_difference/max": 1.4478373527526855, |
| "sampling/sampling_logp_difference/mean": 0.020613517612218857, |
| "step": 70, |
| "step_time": 346.7764633935876 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13559.0, |
| "completions/max_terminated_length": 13559.0, |
| "completions/mean_length": 10056.625, |
| "completions/mean_terminated_length": 10056.625, |
| "completions/min_length": 6305.0, |
| "completions/min_terminated_length": 6305.0, |
| "entropy": 0.2842098120599985, |
| "epoch": 0.5772357723577236, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39119473099708557, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0067, |
| "num_tokens": 24707362.0, |
| "reward": 1.3005447387695312, |
| "reward_std": 1.250781536102295, |
| "rewards/reward_func/mean": 1.3005447387695312, |
| "rewards/reward_func/std": 1.250781536102295, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.6534061431884766, |
| "sampling/importance_sampling_ratio/mean": 0.9900968670845032, |
| "sampling/importance_sampling_ratio/min": 0.25348275899887085, |
| "sampling/sampling_logp_difference/max": 1.3724594116210938, |
| "sampling/sampling_logp_difference/mean": 0.02001977153122425, |
| "step": 71, |
| "step_time": 538.6896389278118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15581.0, |
| "completions/max_terminated_length": 15581.0, |
| "completions/mean_length": 10180.1875, |
| "completions/mean_terminated_length": 10180.1875, |
| "completions/min_length": 5827.0, |
| "completions/min_terminated_length": 5827.0, |
| "entropy": 0.26517100259661674, |
| "epoch": 0.5853658536585366, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38772645592689514, |
| "learning_rate": 7.159999999999999e-07, |
| "loss": 0.0407, |
| "num_tokens": 25052880.0, |
| "reward": 1.7468311786651611, |
| "reward_std": 4.23643684387207, |
| "rewards/reward_func/mean": 1.7468311786651611, |
| "rewards/reward_func/std": 4.23643684387207, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2769153118133545, |
| "sampling/importance_sampling_ratio/mean": 0.9908599257469177, |
| "sampling/importance_sampling_ratio/min": 0.07548689097166061, |
| "sampling/sampling_logp_difference/max": 2.583796262741089, |
| "sampling/sampling_logp_difference/mean": 0.018888521939516068, |
| "step": 72, |
| "step_time": 384.7713236459531 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14703.0, |
| "completions/max_terminated_length": 14703.0, |
| "completions/mean_length": 10991.09375, |
| "completions/mean_terminated_length": 10991.09375, |
| "completions/min_length": 7374.0, |
| "completions/min_terminated_length": 7374.0, |
| "entropy": 0.2600755328312516, |
| "epoch": 0.5934959349593496, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37328991293907166, |
| "learning_rate": 7.119999999999999e-07, |
| "loss": 0.0361, |
| "num_tokens": 25433011.0, |
| "reward": 0.9751912355422974, |
| "reward_std": 1.0739766359329224, |
| "rewards/reward_func/mean": 0.9751912355422974, |
| "rewards/reward_func/std": 1.0739765167236328, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9908562898635864, |
| "sampling/importance_sampling_ratio/min": 0.12725582718849182, |
| "sampling/sampling_logp_difference/max": 2.061555862426758, |
| "sampling/sampling_logp_difference/mean": 0.01900428719818592, |
| "step": 73, |
| "step_time": 383.6669119540602 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14219.0, |
| "completions/max_terminated_length": 14219.0, |
| "completions/mean_length": 9966.125, |
| "completions/mean_terminated_length": 9966.125, |
| "completions/min_length": 5639.0, |
| "completions/min_terminated_length": 5639.0, |
| "entropy": 0.27439984772354364, |
| "epoch": 0.6016260162601627, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3848857581615448, |
| "learning_rate": 7.079999999999999e-07, |
| "loss": 0.0073, |
| "num_tokens": 25773751.0, |
| "reward": 0.7620717287063599, |
| "reward_std": 0.6133687496185303, |
| "rewards/reward_func/mean": 0.7620717287063599, |
| "rewards/reward_func/std": 0.6133686900138855, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2783548831939697, |
| "sampling/importance_sampling_ratio/mean": 0.9905116558074951, |
| "sampling/importance_sampling_ratio/min": 0.24211688339710236, |
| "sampling/sampling_logp_difference/max": 1.418334722518921, |
| "sampling/sampling_logp_difference/mean": 0.01978294551372528, |
| "step": 74, |
| "step_time": 385.7722886954434 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13233.0, |
| "completions/max_terminated_length": 13233.0, |
| "completions/mean_length": 8152.53125, |
| "completions/mean_terminated_length": 8152.53125, |
| "completions/min_length": 5875.0, |
| "completions/min_terminated_length": 5875.0, |
| "entropy": 0.2985440846532583, |
| "epoch": 0.6097560975609756, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43377742171287537, |
| "learning_rate": 7.04e-07, |
| "loss": -0.035, |
| "num_tokens": 26044104.0, |
| "reward": 1.812477469444275, |
| "reward_std": 1.1994365453720093, |
| "rewards/reward_func/mean": 1.812477469444275, |
| "rewards/reward_func/std": 1.1994365453720093, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.35860276222229, |
| "sampling/importance_sampling_ratio/mean": 0.9896669387817383, |
| "sampling/importance_sampling_ratio/min": 0.06281934678554535, |
| "sampling/sampling_logp_difference/max": 2.7674922943115234, |
| "sampling/sampling_logp_difference/mean": 0.021110281348228455, |
| "step": 75, |
| "step_time": 291.5811023626011 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15224.0, |
| "completions/max_terminated_length": 15224.0, |
| "completions/mean_length": 10790.53125, |
| "completions/mean_terminated_length": 10790.53125, |
| "completions/min_length": 7823.0, |
| "completions/min_terminated_length": 7823.0, |
| "entropy": 0.2837300254032016, |
| "epoch": 0.6178861788617886, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38661646842956543, |
| "learning_rate": 7e-07, |
| "loss": -0.0108, |
| "num_tokens": 26413337.0, |
| "reward": 0.9118523597717285, |
| "reward_std": 0.8564381003379822, |
| "rewards/reward_func/mean": 0.9118523597717285, |
| "rewards/reward_func/std": 0.8564381003379822, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.960435628890991, |
| "sampling/importance_sampling_ratio/mean": 0.9901319742202759, |
| "sampling/importance_sampling_ratio/min": 0.23254553973674774, |
| "sampling/sampling_logp_difference/max": 1.4586691856384277, |
| "sampling/sampling_logp_difference/mean": 0.019878923892974854, |
| "step": 76, |
| "step_time": 380.29050638340414 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16985.0, |
| "completions/max_terminated_length": 16985.0, |
| "completions/mean_length": 9770.53125, |
| "completions/mean_terminated_length": 9770.53125, |
| "completions/min_length": 4901.0, |
| "completions/min_terminated_length": 4901.0, |
| "entropy": 0.2725865785032511, |
| "epoch": 0.6260162601626016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41249147057533264, |
| "learning_rate": 6.959999999999999e-07, |
| "loss": 0.0794, |
| "num_tokens": 26749218.0, |
| "reward": 0.9970647096633911, |
| "reward_std": 0.8713412284851074, |
| "rewards/reward_func/mean": 1.0016499757766724, |
| "rewards/reward_func/std": 0.8657678961753845, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00458526611328125, |
| "rewards/soft_overlong_punishment_reward/std": 0.025938183069229126, |
| "sampling/importance_sampling_ratio/max": 2.4935269355773926, |
| "sampling/importance_sampling_ratio/mean": 0.9904952645301819, |
| "sampling/importance_sampling_ratio/min": 1.930824146256782e-05, |
| "sampling/sampling_logp_difference/max": 10.854978561401367, |
| "sampling/sampling_logp_difference/mean": 0.019513940438628197, |
| "step": 77, |
| "step_time": 404.0872638344299 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 20081.0, |
| "completions/max_terminated_length": 20081.0, |
| "completions/mean_length": 8572.0625, |
| "completions/mean_terminated_length": 8572.0625, |
| "completions/min_length": 4595.0, |
| "completions/min_terminated_length": 4595.0, |
| "entropy": 0.29921186715364456, |
| "epoch": 0.6341463414634146, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4499685764312744, |
| "learning_rate": 6.919999999999999e-07, |
| "loss": 0.0132, |
| "num_tokens": 27042652.0, |
| "reward": 0.7779614329338074, |
| "reward_std": 0.7257394790649414, |
| "rewards/reward_func/mean": 0.8061673045158386, |
| "rewards/reward_func/std": 0.6761706471443176, |
| "rewards/soft_overlong_punishment_reward/mean": -0.02820587158203125, |
| "rewards/soft_overlong_punishment_reward/std": 0.15955650806427002, |
| "sampling/importance_sampling_ratio/max": 2.7844414710998535, |
| "sampling/importance_sampling_ratio/mean": 0.9895604848861694, |
| "sampling/importance_sampling_ratio/min": 0.08751080930233002, |
| "sampling/sampling_logp_difference/max": 2.435992956161499, |
| "sampling/sampling_logp_difference/mean": 0.020626315847039223, |
| "step": 78, |
| "step_time": 375.787244503852 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16246.0, |
| "completions/max_terminated_length": 16246.0, |
| "completions/mean_length": 9898.75, |
| "completions/mean_terminated_length": 9898.75, |
| "completions/min_length": 3833.0, |
| "completions/min_terminated_length": 3833.0, |
| "entropy": 0.30376508831977844, |
| "epoch": 0.6422764227642277, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3933331072330475, |
| "learning_rate": 6.879999999999999e-07, |
| "loss": -0.0387, |
| "num_tokens": 27372492.0, |
| "reward": 0.9470673203468323, |
| "reward_std": 0.6477987766265869, |
| "rewards/reward_func/mean": 0.9470673203468323, |
| "rewards/reward_func/std": 0.6477987170219421, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9050569534301758, |
| "sampling/importance_sampling_ratio/mean": 0.9895777106285095, |
| "sampling/importance_sampling_ratio/min": 0.05304478108882904, |
| "sampling/sampling_logp_difference/max": 2.9366188049316406, |
| "sampling/sampling_logp_difference/mean": 0.02129667066037655, |
| "step": 79, |
| "step_time": 359.03310281271115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12132.0, |
| "completions/max_terminated_length": 12132.0, |
| "completions/mean_length": 8229.09375, |
| "completions/mean_terminated_length": 8229.09375, |
| "completions/min_length": 6227.0, |
| "completions/min_terminated_length": 6227.0, |
| "entropy": 0.30129735358059406, |
| "epoch": 0.6504065040650406, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.45306941866874695, |
| "learning_rate": 6.84e-07, |
| "loss": 0.0203, |
| "num_tokens": 27650159.0, |
| "reward": 2.131523847579956, |
| "reward_std": 1.2827610969543457, |
| "rewards/reward_func/mean": 2.131523847579956, |
| "rewards/reward_func/std": 1.2827610969543457, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0942087173461914, |
| "sampling/importance_sampling_ratio/mean": 0.989490270614624, |
| "sampling/importance_sampling_ratio/min": 0.02814963459968567, |
| "sampling/sampling_logp_difference/max": 3.570220947265625, |
| "sampling/sampling_logp_difference/mean": 0.02144203893840313, |
| "step": 80, |
| "step_time": 300.18679245212115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15563.0, |
| "completions/max_terminated_length": 15563.0, |
| "completions/mean_length": 10849.8125, |
| "completions/mean_terminated_length": 10849.8125, |
| "completions/min_length": 7417.0, |
| "completions/min_terminated_length": 7417.0, |
| "entropy": 0.2771333325654268, |
| "epoch": 0.6585365853658537, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3706999123096466, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0067, |
| "num_tokens": 28017993.0, |
| "reward": 0.7668748497962952, |
| "reward_std": 0.7185682654380798, |
| "rewards/reward_func/mean": 0.7668748497962952, |
| "rewards/reward_func/std": 0.7185682654380798, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.9884979724884033, |
| "sampling/importance_sampling_ratio/mean": 0.9902896881103516, |
| "sampling/importance_sampling_ratio/min": 0.007819842547178268, |
| "sampling/sampling_logp_difference/max": 4.851090908050537, |
| "sampling/sampling_logp_difference/mean": 0.01989693194627762, |
| "step": 81, |
| "step_time": 368.69332744996063 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14354.0, |
| "completions/max_terminated_length": 14354.0, |
| "completions/mean_length": 10310.65625, |
| "completions/mean_terminated_length": 10310.65625, |
| "completions/min_length": 7593.0, |
| "completions/min_terminated_length": 7593.0, |
| "entropy": 0.2430378319695592, |
| "epoch": 0.6666666666666666, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36904025077819824, |
| "learning_rate": 6.76e-07, |
| "loss": -0.0143, |
| "num_tokens": 28370734.0, |
| "reward": 0.7325789928436279, |
| "reward_std": 0.7706360816955566, |
| "rewards/reward_func/mean": 0.7325789928436279, |
| "rewards/reward_func/std": 0.7706360816955566, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3821725845336914, |
| "sampling/importance_sampling_ratio/mean": 0.9915218949317932, |
| "sampling/importance_sampling_ratio/min": 0.01873675547540188, |
| "sampling/sampling_logp_difference/max": 3.9772682189941406, |
| "sampling/sampling_logp_difference/mean": 0.017709840089082718, |
| "step": 82, |
| "step_time": 554.2226089162286 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14717.0, |
| "completions/max_terminated_length": 14717.0, |
| "completions/mean_length": 10310.9375, |
| "completions/mean_terminated_length": 10310.9375, |
| "completions/min_length": 6474.0, |
| "completions/min_terminated_length": 6474.0, |
| "entropy": 0.2876297067850828, |
| "epoch": 0.6747967479674797, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38776862621307373, |
| "learning_rate": 6.72e-07, |
| "loss": 0.0177, |
| "num_tokens": 28721708.0, |
| "reward": 1.4957520961761475, |
| "reward_std": 1.590112566947937, |
| "rewards/reward_func/mean": 1.4957520961761475, |
| "rewards/reward_func/std": 1.5901126861572266, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.842559576034546, |
| "sampling/importance_sampling_ratio/mean": 0.9902292490005493, |
| "sampling/importance_sampling_ratio/min": 6.962251973163802e-06, |
| "sampling/sampling_logp_difference/max": 11.875007629394531, |
| "sampling/sampling_logp_difference/mean": 0.020037703216075897, |
| "step": 83, |
| "step_time": 359.8615084751509 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14412.0, |
| "completions/max_terminated_length": 14412.0, |
| "completions/mean_length": 9628.71875, |
| "completions/mean_terminated_length": 9628.71875, |
| "completions/min_length": 5146.0, |
| "completions/min_terminated_length": 5146.0, |
| "entropy": 0.2873276565223932, |
| "epoch": 0.6829268292682927, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40790072083473206, |
| "learning_rate": 6.68e-07, |
| "loss": 0.0, |
| "num_tokens": 29047491.0, |
| "reward": 1.1052613258361816, |
| "reward_std": 0.8609189987182617, |
| "rewards/reward_func/mean": 1.1052613258361816, |
| "rewards/reward_func/std": 0.8609189987182617, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2451746463775635, |
| "sampling/importance_sampling_ratio/mean": 0.9900080561637878, |
| "sampling/importance_sampling_ratio/min": 0.3554363250732422, |
| "sampling/sampling_logp_difference/max": 1.0344091653823853, |
| "sampling/sampling_logp_difference/mean": 0.020314738154411316, |
| "step": 84, |
| "step_time": 354.82466936972924 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12796.0, |
| "completions/max_terminated_length": 12796.0, |
| "completions/mean_length": 9634.09375, |
| "completions/mean_terminated_length": 9634.09375, |
| "completions/min_length": 5843.0, |
| "completions/min_terminated_length": 5843.0, |
| "entropy": 0.29404681362211704, |
| "epoch": 0.6910569105691057, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.4186258018016815, |
| "learning_rate": 6.64e-07, |
| "loss": -0.0343, |
| "num_tokens": 29372486.0, |
| "reward": 0.45844361186027527, |
| "reward_std": 0.5968901515007019, |
| "rewards/reward_func/mean": 0.45844361186027527, |
| "rewards/reward_func/std": 0.5968901515007019, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.8098247051239014, |
| "sampling/importance_sampling_ratio/mean": 0.9896739721298218, |
| "sampling/importance_sampling_ratio/min": 2.6880831782705172e-08, |
| "sampling/sampling_logp_difference/max": 17.431852340698242, |
| "sampling/sampling_logp_difference/mean": 0.020807670429348946, |
| "step": 85, |
| "step_time": 520.7830489559565 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14586.0, |
| "completions/max_terminated_length": 14586.0, |
| "completions/mean_length": 10420.28125, |
| "completions/mean_terminated_length": 10420.28125, |
| "completions/min_length": 6195.0, |
| "completions/min_terminated_length": 6195.0, |
| "entropy": 0.27361532766371965, |
| "epoch": 0.6991869918699187, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4719882607460022, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0101, |
| "num_tokens": 29725231.0, |
| "reward": 0.7226468920707703, |
| "reward_std": 0.8586759567260742, |
| "rewards/reward_func/mean": 0.7226468920707703, |
| "rewards/reward_func/std": 0.8586759567260742, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9905459880828857, |
| "sampling/importance_sampling_ratio/min": 0.004422293510288, |
| "sampling/sampling_logp_difference/max": 5.4210968017578125, |
| "sampling/sampling_logp_difference/mean": 0.019551947712898254, |
| "step": 86, |
| "step_time": 431.56240568542853 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15442.0, |
| "completions/max_terminated_length": 15442.0, |
| "completions/mean_length": 9467.71875, |
| "completions/mean_terminated_length": 9467.71875, |
| "completions/min_length": 4182.0, |
| "completions/min_terminated_length": 4182.0, |
| "entropy": 0.2771501960232854, |
| "epoch": 0.7073170731707317, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.373221218585968, |
| "learning_rate": 6.56e-07, |
| "loss": 0.0111, |
| "num_tokens": 30044670.0, |
| "reward": 0.8195620179176331, |
| "reward_std": 0.6958929300308228, |
| "rewards/reward_func/mean": 0.8195620179176331, |
| "rewards/reward_func/std": 0.6958929300308228, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902039766311646, |
| "sampling/importance_sampling_ratio/min": 0.2062566578388214, |
| "sampling/sampling_logp_difference/max": 1.5786339044570923, |
| "sampling/sampling_logp_difference/mean": 0.020228460431098938, |
| "step": 87, |
| "step_time": 379.2372320953291 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13434.0, |
| "completions/max_terminated_length": 13434.0, |
| "completions/mean_length": 7646.875, |
| "completions/mean_terminated_length": 7646.875, |
| "completions/min_length": 2072.0, |
| "completions/min_terminated_length": 2072.0, |
| "entropy": 0.29658396914601326, |
| "epoch": 0.7154471544715447, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4571278393268585, |
| "learning_rate": 6.52e-07, |
| "loss": -0.015, |
| "num_tokens": 30299066.0, |
| "reward": 1.1921298503875732, |
| "reward_std": 0.7886288166046143, |
| "rewards/reward_func/mean": 1.1921298503875732, |
| "rewards/reward_func/std": 0.7886288166046143, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9895877838134766, |
| "sampling/importance_sampling_ratio/min": 0.07900335639715195, |
| "sampling/sampling_logp_difference/max": 2.5382649898529053, |
| "sampling/sampling_logp_difference/mean": 0.021227367222309113, |
| "step": 88, |
| "step_time": 305.5276520336047 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17540.0, |
| "completions/max_terminated_length": 17540.0, |
| "completions/mean_length": 11489.90625, |
| "completions/mean_terminated_length": 11489.90625, |
| "completions/min_length": 5587.0, |
| "completions/min_terminated_length": 5587.0, |
| "entropy": 0.2844330407679081, |
| "epoch": 0.7235772357723578, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3779417872428894, |
| "learning_rate": 6.48e-07, |
| "loss": 0.0617, |
| "num_tokens": 30689791.0, |
| "reward": 0.8213021755218506, |
| "reward_std": 0.6814008355140686, |
| "rewards/reward_func/mean": 0.8301217555999756, |
| "rewards/reward_func/std": 0.6683583855628967, |
| "rewards/soft_overlong_punishment_reward/mean": -0.008819580078125, |
| "rewards/soft_overlong_punishment_reward/std": 0.04989107698202133, |
| "sampling/importance_sampling_ratio/max": 2.003089189529419, |
| "sampling/importance_sampling_ratio/mean": 0.9900344014167786, |
| "sampling/importance_sampling_ratio/min": 0.00883798860013485, |
| "sampling/sampling_logp_difference/max": 4.728695869445801, |
| "sampling/sampling_logp_difference/mean": 0.020483214408159256, |
| "step": 89, |
| "step_time": 492.1519634043798 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15601.0, |
| "completions/max_terminated_length": 15601.0, |
| "completions/mean_length": 9316.0, |
| "completions/mean_terminated_length": 9316.0, |
| "completions/min_length": 4857.0, |
| "completions/min_terminated_length": 4857.0, |
| "entropy": 0.28548908419907093, |
| "epoch": 0.7317073170731707, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40802574157714844, |
| "learning_rate": 6.44e-07, |
| "loss": 0.0538, |
| "num_tokens": 31001015.0, |
| "reward": 0.9615650177001953, |
| "reward_std": 0.8863016366958618, |
| "rewards/reward_func/mean": 0.9615650177001953, |
| "rewards/reward_func/std": 0.8863016366958618, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.969780683517456, |
| "sampling/importance_sampling_ratio/mean": 0.990296483039856, |
| "sampling/importance_sampling_ratio/min": 0.0010702904546633363, |
| "sampling/sampling_logp_difference/max": 6.83982515335083, |
| "sampling/sampling_logp_difference/mean": 0.020029699429869652, |
| "step": 90, |
| "step_time": 341.7149986529257 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15982.0, |
| "completions/max_terminated_length": 15982.0, |
| "completions/mean_length": 10284.1875, |
| "completions/mean_terminated_length": 10284.1875, |
| "completions/min_length": 6201.0, |
| "completions/min_terminated_length": 6201.0, |
| "entropy": 0.2762817107141018, |
| "epoch": 0.7398373983739838, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3966502249240875, |
| "learning_rate": 6.4e-07, |
| "loss": -0.0541, |
| "num_tokens": 31353965.0, |
| "reward": 0.6544884443283081, |
| "reward_std": 0.5549855828285217, |
| "rewards/reward_func/mean": 0.6544884443283081, |
| "rewards/reward_func/std": 0.5549855828285217, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.07761287689209, |
| "sampling/importance_sampling_ratio/mean": 0.9903784990310669, |
| "sampling/importance_sampling_ratio/min": 0.028766363859176636, |
| "sampling/sampling_logp_difference/max": 3.548548460006714, |
| "sampling/sampling_logp_difference/mean": 0.019771484658122063, |
| "step": 91, |
| "step_time": 365.1295900817495 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12146.0, |
| "completions/max_terminated_length": 12146.0, |
| "completions/mean_length": 8731.96875, |
| "completions/mean_terminated_length": 8731.96875, |
| "completions/min_length": 6007.0, |
| "completions/min_terminated_length": 6007.0, |
| "entropy": 0.27454613894224167, |
| "epoch": 0.7479674796747967, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4091607332229614, |
| "learning_rate": 6.36e-07, |
| "loss": 0.0174, |
| "num_tokens": 31652684.0, |
| "reward": 1.158060073852539, |
| "reward_std": 1.1567602157592773, |
| "rewards/reward_func/mean": 1.158060073852539, |
| "rewards/reward_func/std": 1.1567600965499878, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.083181858062744, |
| "sampling/importance_sampling_ratio/mean": 0.9904876947402954, |
| "sampling/importance_sampling_ratio/min": 0.17917752265930176, |
| "sampling/sampling_logp_difference/max": 1.7193782329559326, |
| "sampling/sampling_logp_difference/mean": 0.019298098981380463, |
| "step": 92, |
| "step_time": 288.10143864457496 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14462.0, |
| "completions/max_terminated_length": 14462.0, |
| "completions/mean_length": 10936.0625, |
| "completions/mean_terminated_length": 10936.0625, |
| "completions/min_length": 7082.0, |
| "completions/min_terminated_length": 7082.0, |
| "entropy": 0.25290792621672153, |
| "epoch": 0.7560975609756098, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3430319130420685, |
| "learning_rate": 6.319999999999999e-07, |
| "loss": 0.0229, |
| "num_tokens": 32026262.0, |
| "reward": 0.730975866317749, |
| "reward_std": 1.1369214057922363, |
| "rewards/reward_func/mean": 0.730975866317749, |
| "rewards/reward_func/std": 1.1369214057922363, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5900559425354004, |
| "sampling/importance_sampling_ratio/mean": 0.9913033843040466, |
| "sampling/importance_sampling_ratio/min": 0.24479100108146667, |
| "sampling/sampling_logp_difference/max": 1.4073505401611328, |
| "sampling/sampling_logp_difference/mean": 0.018287766724824905, |
| "step": 93, |
| "step_time": 343.9439448376652 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12133.0, |
| "completions/max_terminated_length": 12133.0, |
| "completions/mean_length": 9534.03125, |
| "completions/mean_terminated_length": 9534.03125, |
| "completions/min_length": 6151.0, |
| "completions/min_terminated_length": 6151.0, |
| "entropy": 0.2943458501249552, |
| "epoch": 0.7642276422764228, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.45476409792900085, |
| "learning_rate": 6.28e-07, |
| "loss": -0.0413, |
| "num_tokens": 32348031.0, |
| "reward": 0.7935033440589905, |
| "reward_std": 0.5916282534599304, |
| "rewards/reward_func/mean": 0.7935033440589905, |
| "rewards/reward_func/std": 0.5916281938552856, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3106529712677, |
| "sampling/importance_sampling_ratio/mean": 0.9898567795753479, |
| "sampling/importance_sampling_ratio/min": 0.35712242126464844, |
| "sampling/sampling_logp_difference/max": 1.0296766757965088, |
| "sampling/sampling_logp_difference/mean": 0.020504558458924294, |
| "step": 94, |
| "step_time": 310.55569249019027 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13674.0, |
| "completions/max_terminated_length": 13674.0, |
| "completions/mean_length": 11145.9375, |
| "completions/mean_terminated_length": 11145.9375, |
| "completions/min_length": 7647.0, |
| "completions/min_terminated_length": 7647.0, |
| "entropy": 0.25408192910254, |
| "epoch": 0.7723577235772358, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3503531813621521, |
| "learning_rate": 6.24e-07, |
| "loss": -0.0358, |
| "num_tokens": 32743661.0, |
| "reward": 1.0657795667648315, |
| "reward_std": 0.5564767122268677, |
| "rewards/reward_func/mean": 1.0657795667648315, |
| "rewards/reward_func/std": 0.5564767122268677, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9909650087356567, |
| "sampling/importance_sampling_ratio/min": 0.024863438680768013, |
| "sampling/sampling_logp_difference/max": 3.694356918334961, |
| "sampling/sampling_logp_difference/mean": 0.018509428948163986, |
| "step": 95, |
| "step_time": 504.7223396820482 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12207.0, |
| "completions/max_terminated_length": 12207.0, |
| "completions/mean_length": 7209.34375, |
| "completions/mean_terminated_length": 7209.34375, |
| "completions/min_length": 3655.0, |
| "completions/min_terminated_length": 3655.0, |
| "entropy": 0.2981985919177532, |
| "epoch": 0.7804878048780488, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.4046306014060974, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0208, |
| "num_tokens": 32986864.0, |
| "reward": 1.0635581016540527, |
| "reward_std": 0.6817582249641418, |
| "rewards/reward_func/mean": 1.0635581016540527, |
| "rewards/reward_func/std": 0.6817582845687866, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9894931316375732, |
| "sampling/importance_sampling_ratio/min": 0.20488663017749786, |
| "sampling/sampling_logp_difference/max": 1.5852985382080078, |
| "sampling/sampling_logp_difference/mean": 0.020824309438467026, |
| "step": 96, |
| "step_time": 286.0661023929715 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14730.0, |
| "completions/max_terminated_length": 14730.0, |
| "completions/mean_length": 10380.71875, |
| "completions/mean_terminated_length": 10380.71875, |
| "completions/min_length": 5851.0, |
| "completions/min_terminated_length": 5851.0, |
| "entropy": 0.27193080354481936, |
| "epoch": 0.7886178861788617, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.32370734214782715, |
| "learning_rate": 6.16e-07, |
| "loss": 0.0166, |
| "num_tokens": 33339135.0, |
| "reward": 0.4628852605819702, |
| "reward_std": 0.697974443435669, |
| "rewards/reward_func/mean": 0.4628852605819702, |
| "rewards/reward_func/std": 0.6979743838310242, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9995118379592896, |
| "sampling/importance_sampling_ratio/mean": 0.990587592124939, |
| "sampling/importance_sampling_ratio/min": 0.3680788576602936, |
| "sampling/sampling_logp_difference/max": 0.9994580745697021, |
| "sampling/sampling_logp_difference/mean": 0.019620057195425034, |
| "step": 97, |
| "step_time": 386.9268953008577 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17358.0, |
| "completions/max_terminated_length": 17358.0, |
| "completions/mean_length": 10972.78125, |
| "completions/mean_terminated_length": 10972.78125, |
| "completions/min_length": 6301.0, |
| "completions/min_terminated_length": 6301.0, |
| "entropy": 0.2722749076783657, |
| "epoch": 0.7967479674796748, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38124802708625793, |
| "learning_rate": 6.119999999999999e-07, |
| "loss": -0.0324, |
| "num_tokens": 33703832.0, |
| "reward": 3.1216416358947754, |
| "reward_std": 4.944037437438965, |
| "rewards/reward_func/mean": 3.129072666168213, |
| "rewards/reward_func/std": 4.940955638885498, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0074310302734375, |
| "rewards/soft_overlong_punishment_reward/std": 0.042036253958940506, |
| "sampling/importance_sampling_ratio/max": 2.931342124938965, |
| "sampling/importance_sampling_ratio/mean": 0.9906518459320068, |
| "sampling/importance_sampling_ratio/min": 0.14344006776809692, |
| "sampling/sampling_logp_difference/max": 1.941838026046753, |
| "sampling/sampling_logp_difference/mean": 0.01964397355914116, |
| "step": 98, |
| "step_time": 387.24955694633536 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17372.0, |
| "completions/max_terminated_length": 17372.0, |
| "completions/mean_length": 11532.4375, |
| "completions/mean_terminated_length": 11532.4375, |
| "completions/min_length": 8395.0, |
| "completions/min_terminated_length": 8395.0, |
| "entropy": 0.2691387142986059, |
| "epoch": 0.8048780487804879, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3799015283584595, |
| "learning_rate": 6.079999999999999e-07, |
| "loss": 0.0257, |
| "num_tokens": 34090550.0, |
| "reward": 0.8006877899169922, |
| "reward_std": 0.6781271696090698, |
| "rewards/reward_func/mean": 0.8082256317138672, |
| "rewards/reward_func/std": 0.6674283742904663, |
| "rewards/soft_overlong_punishment_reward/mean": -0.007537841796875, |
| "rewards/soft_overlong_punishment_reward/std": 0.04264046996831894, |
| "sampling/importance_sampling_ratio/max": 2.300957202911377, |
| "sampling/importance_sampling_ratio/mean": 0.9906717538833618, |
| "sampling/importance_sampling_ratio/min": 0.20086948573589325, |
| "sampling/sampling_logp_difference/max": 1.6050999164581299, |
| "sampling/sampling_logp_difference/mean": 0.019317321479320526, |
| "step": 99, |
| "step_time": 392.5223165056668 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12911.0, |
| "completions/max_terminated_length": 12911.0, |
| "completions/mean_length": 9746.75, |
| "completions/mean_terminated_length": 9746.75, |
| "completions/min_length": 6072.0, |
| "completions/min_terminated_length": 6072.0, |
| "entropy": 0.2797823455184698, |
| "epoch": 0.8130081300813008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40460583567619324, |
| "learning_rate": 6.04e-07, |
| "loss": 0.029, |
| "num_tokens": 34424134.0, |
| "reward": 0.8509798049926758, |
| "reward_std": 0.6565719842910767, |
| "rewards/reward_func/mean": 0.8509798049926758, |
| "rewards/reward_func/std": 0.6565720438957214, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1567296981811523, |
| "sampling/importance_sampling_ratio/mean": 0.9903455376625061, |
| "sampling/importance_sampling_ratio/min": 0.04983352869749069, |
| "sampling/sampling_logp_difference/max": 2.9990673065185547, |
| "sampling/sampling_logp_difference/mean": 0.020120201632380486, |
| "step": 100, |
| "step_time": 411.4386176103726 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14230.0, |
| "completions/max_terminated_length": 14230.0, |
| "completions/mean_length": 9372.625, |
| "completions/mean_terminated_length": 9372.625, |
| "completions/min_length": 6478.0, |
| "completions/min_terminated_length": 6478.0, |
| "entropy": 0.2752824854105711, |
| "epoch": 0.8211382113821138, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42639046907424927, |
| "learning_rate": 6e-07, |
| "loss": 0.0453, |
| "num_tokens": 34742706.0, |
| "reward": 0.8862287998199463, |
| "reward_std": 0.5651472806930542, |
| "rewards/reward_func/mean": 0.8862287998199463, |
| "rewards/reward_func/std": 0.5651472806930542, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2520346641540527, |
| "sampling/importance_sampling_ratio/mean": 0.9904347062110901, |
| "sampling/importance_sampling_ratio/min": 0.2951987087726593, |
| "sampling/sampling_logp_difference/max": 1.220106601715088, |
| "sampling/sampling_logp_difference/mean": 0.019315559417009354, |
| "step": 101, |
| "step_time": 487.11677971179597 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14193.0, |
| "completions/max_terminated_length": 14193.0, |
| "completions/mean_length": 9737.625, |
| "completions/mean_terminated_length": 9737.625, |
| "completions/min_length": 6748.0, |
| "completions/min_terminated_length": 6748.0, |
| "entropy": 0.26770962681621313, |
| "epoch": 0.8292682926829268, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40013840794563293, |
| "learning_rate": 5.96e-07, |
| "loss": 0.0546, |
| "num_tokens": 35072926.0, |
| "reward": 0.8616600036621094, |
| "reward_std": 0.6439077854156494, |
| "rewards/reward_func/mean": 0.8616600036621094, |
| "rewards/reward_func/std": 0.6439077258110046, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9906123876571655, |
| "sampling/importance_sampling_ratio/min": 0.08764204382896423, |
| "sampling/sampling_logp_difference/max": 2.4344944953918457, |
| "sampling/sampling_logp_difference/mean": 0.019352490082383156, |
| "step": 102, |
| "step_time": 527.143631025916 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13535.0, |
| "completions/max_terminated_length": 13535.0, |
| "completions/mean_length": 7765.5, |
| "completions/mean_terminated_length": 7765.5, |
| "completions/min_length": 2947.0, |
| "completions/min_terminated_length": 2947.0, |
| "entropy": 0.2951889578253031, |
| "epoch": 0.8373983739837398, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.438553124666214, |
| "learning_rate": 5.919999999999999e-07, |
| "loss": 0.0248, |
| "num_tokens": 35336830.0, |
| "reward": 1.7072114944458008, |
| "reward_std": 1.5981085300445557, |
| "rewards/reward_func/mean": 1.7072114944458008, |
| "rewards/reward_func/std": 1.5981085300445557, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.115924119949341, |
| "sampling/importance_sampling_ratio/mean": 0.9898935556411743, |
| "sampling/importance_sampling_ratio/min": 7.95746746007353e-06, |
| "sampling/sampling_logp_difference/max": 11.741399765014648, |
| "sampling/sampling_logp_difference/mean": 0.020254164934158325, |
| "step": 103, |
| "step_time": 370.2516266454477 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14842.0, |
| "completions/max_terminated_length": 14842.0, |
| "completions/mean_length": 10355.96875, |
| "completions/mean_terminated_length": 10355.96875, |
| "completions/min_length": 5380.0, |
| "completions/min_terminated_length": 5380.0, |
| "entropy": 0.27049592323601246, |
| "epoch": 0.8455284552845529, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4156341850757599, |
| "learning_rate": 5.879999999999999e-07, |
| "loss": 0.0127, |
| "num_tokens": 35693509.0, |
| "reward": 0.41826581954956055, |
| "reward_std": 0.4194912612438202, |
| "rewards/reward_func/mean": 0.41826581954956055, |
| "rewards/reward_func/std": 0.4194912910461426, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.567049264907837, |
| "sampling/importance_sampling_ratio/mean": 0.9908696413040161, |
| "sampling/importance_sampling_ratio/min": 0.14091628789901733, |
| "sampling/sampling_logp_difference/max": 1.9595892429351807, |
| "sampling/sampling_logp_difference/mean": 0.01946902647614479, |
| "step": 104, |
| "step_time": 353.2303258762695 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13809.0, |
| "completions/max_terminated_length": 13809.0, |
| "completions/mean_length": 9495.78125, |
| "completions/mean_terminated_length": 9495.78125, |
| "completions/min_length": 4356.0, |
| "completions/min_terminated_length": 4356.0, |
| "entropy": 0.2733620647341013, |
| "epoch": 0.8536585365853658, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4369182586669922, |
| "learning_rate": 5.839999999999999e-07, |
| "loss": -0.0393, |
| "num_tokens": 36021462.0, |
| "reward": 1.382319450378418, |
| "reward_std": 1.3517597913742065, |
| "rewards/reward_func/mean": 1.382319450378418, |
| "rewards/reward_func/std": 1.351759672164917, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902534484863281, |
| "sampling/importance_sampling_ratio/min": 0.12755648791790009, |
| "sampling/sampling_logp_difference/max": 2.0591959953308105, |
| "sampling/sampling_logp_difference/mean": 0.019751761108636856, |
| "step": 105, |
| "step_time": 376.45426351577044 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16323.0, |
| "completions/max_terminated_length": 16323.0, |
| "completions/mean_length": 11590.21875, |
| "completions/mean_terminated_length": 11590.21875, |
| "completions/min_length": 7957.0, |
| "completions/min_terminated_length": 7957.0, |
| "entropy": 0.26938960794359446, |
| "epoch": 0.8617886178861789, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3502255082130432, |
| "learning_rate": 5.8e-07, |
| "loss": -0.0654, |
| "num_tokens": 36413573.0, |
| "reward": 1.1353490352630615, |
| "reward_std": 0.9817598462104797, |
| "rewards/reward_func/mean": 1.1353490352630615, |
| "rewards/reward_func/std": 0.9817598462104797, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.990548849105835, |
| "sampling/importance_sampling_ratio/min": 0.16546453535556793, |
| "sampling/sampling_logp_difference/max": 1.7989983558654785, |
| "sampling/sampling_logp_difference/mean": 0.0195661298930645, |
| "step": 106, |
| "step_time": 459.41654721833766 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12627.0, |
| "completions/max_terminated_length": 12627.0, |
| "completions/mean_length": 10500.6875, |
| "completions/mean_terminated_length": 10500.6875, |
| "completions/min_length": 7936.0, |
| "completions/min_terminated_length": 7936.0, |
| "entropy": 0.26135144662112, |
| "epoch": 0.8699186991869918, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3937532901763916, |
| "learning_rate": 5.76e-07, |
| "loss": -0.0177, |
| "num_tokens": 36769739.0, |
| "reward": 0.8728915452957153, |
| "reward_std": 0.605887770652771, |
| "rewards/reward_func/mean": 0.8728915452957153, |
| "rewards/reward_func/std": 0.6058877110481262, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5900862216949463, |
| "sampling/importance_sampling_ratio/mean": 0.9908682107925415, |
| "sampling/importance_sampling_ratio/min": 0.16999557614326477, |
| "sampling/sampling_logp_difference/max": 1.7719829082489014, |
| "sampling/sampling_logp_difference/mean": 0.018946364521980286, |
| "step": 107, |
| "step_time": 383.9537097290158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13026.0, |
| "completions/max_terminated_length": 13026.0, |
| "completions/mean_length": 8727.03125, |
| "completions/mean_terminated_length": 8727.03125, |
| "completions/min_length": 3240.0, |
| "completions/min_terminated_length": 3240.0, |
| "entropy": 0.28625404462218285, |
| "epoch": 0.8780487804878049, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4003085792064667, |
| "learning_rate": 5.719999999999999e-07, |
| "loss": -0.0317, |
| "num_tokens": 37067684.0, |
| "reward": 0.8368265628814697, |
| "reward_std": 0.700191855430603, |
| "rewards/reward_func/mean": 0.8368265628814697, |
| "rewards/reward_func/std": 0.7001917958259583, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3381805419921875, |
| "sampling/importance_sampling_ratio/mean": 0.990260124206543, |
| "sampling/importance_sampling_ratio/min": 0.3622949719429016, |
| "sampling/sampling_logp_difference/max": 1.0152965784072876, |
| "sampling/sampling_logp_difference/mean": 0.01966564916074276, |
| "step": 108, |
| "step_time": 342.13271795446053 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15831.0, |
| "completions/max_terminated_length": 15831.0, |
| "completions/mean_length": 9779.71875, |
| "completions/mean_terminated_length": 9779.71875, |
| "completions/min_length": 4923.0, |
| "completions/min_terminated_length": 4923.0, |
| "entropy": 0.2904251739382744, |
| "epoch": 0.8861788617886179, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3797857463359833, |
| "learning_rate": 5.679999999999999e-07, |
| "loss": -0.0324, |
| "num_tokens": 37391699.0, |
| "reward": 0.8686186671257019, |
| "reward_std": 1.0430734157562256, |
| "rewards/reward_func/mean": 0.8686186671257019, |
| "rewards/reward_func/std": 1.0430734157562256, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.62477970123291, |
| "sampling/importance_sampling_ratio/mean": 0.9900540113449097, |
| "sampling/importance_sampling_ratio/min": 0.09236539900302887, |
| "sampling/sampling_logp_difference/max": 2.382002830505371, |
| "sampling/sampling_logp_difference/mean": 0.02043168991804123, |
| "step": 109, |
| "step_time": 490.61023391690105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16567.0, |
| "completions/max_terminated_length": 16567.0, |
| "completions/mean_length": 10196.96875, |
| "completions/mean_terminated_length": 10196.96875, |
| "completions/min_length": 3045.0, |
| "completions/min_terminated_length": 3045.0, |
| "entropy": 0.27857582084834576, |
| "epoch": 0.8943089430894309, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6588975191116333, |
| "learning_rate": 5.639999999999999e-07, |
| "loss": 0.0584, |
| "num_tokens": 37735962.0, |
| "reward": 0.9283621907234192, |
| "reward_std": 0.6262274384498596, |
| "rewards/reward_func/mean": 0.9300482869148254, |
| "rewards/reward_func/std": 0.6267218589782715, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00168609619140625, |
| "rewards/soft_overlong_punishment_reward/std": 0.008014494553208351, |
| "sampling/importance_sampling_ratio/max": 2.244232654571533, |
| "sampling/importance_sampling_ratio/mean": 0.9905369281768799, |
| "sampling/importance_sampling_ratio/min": 0.13296444714069366, |
| "sampling/sampling_logp_difference/max": 2.0176734924316406, |
| "sampling/sampling_logp_difference/mean": 0.019462019205093384, |
| "step": 110, |
| "step_time": 402.9467481090687 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15759.0, |
| "completions/max_terminated_length": 15759.0, |
| "completions/mean_length": 10804.0625, |
| "completions/mean_terminated_length": 10804.0625, |
| "completions/min_length": 7114.0, |
| "completions/min_terminated_length": 7114.0, |
| "entropy": 0.2655584989115596, |
| "epoch": 0.9024390243902439, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3911796808242798, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0368, |
| "num_tokens": 38108716.0, |
| "reward": 0.9208776950836182, |
| "reward_std": 0.6002487540245056, |
| "rewards/reward_func/mean": 0.9208776950836182, |
| "rewards/reward_func/std": 0.6002486944198608, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9908932447433472, |
| "sampling/importance_sampling_ratio/min": 0.11072701215744019, |
| "sampling/sampling_logp_difference/max": 2.2006874084472656, |
| "sampling/sampling_logp_difference/mean": 0.018912389874458313, |
| "step": 111, |
| "step_time": 407.13406765437685 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12699.0, |
| "completions/max_terminated_length": 12699.0, |
| "completions/mean_length": 9181.875, |
| "completions/mean_terminated_length": 9181.875, |
| "completions/min_length": 5314.0, |
| "completions/min_terminated_length": 5314.0, |
| "entropy": 0.30285687930881977, |
| "epoch": 0.9105691056910569, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3971967101097107, |
| "learning_rate": 5.560000000000001e-07, |
| "loss": 0.0194, |
| "num_tokens": 38414432.0, |
| "reward": 0.7858570218086243, |
| "reward_std": 0.641828179359436, |
| "rewards/reward_func/mean": 0.7858570218086243, |
| "rewards/reward_func/std": 0.641828179359436, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0928192138671875, |
| "sampling/importance_sampling_ratio/mean": 0.989500105381012, |
| "sampling/importance_sampling_ratio/min": 0.3735429346561432, |
| "sampling/sampling_logp_difference/max": 0.984722375869751, |
| "sampling/sampling_logp_difference/mean": 0.021352462470531464, |
| "step": 112, |
| "step_time": 310.6167916948907 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12846.0, |
| "completions/max_terminated_length": 12846.0, |
| "completions/mean_length": 9087.71875, |
| "completions/mean_terminated_length": 9087.71875, |
| "completions/min_length": 2983.0, |
| "completions/min_terminated_length": 2983.0, |
| "entropy": 0.2819946352392435, |
| "epoch": 0.9186991869918699, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.3225323557853699, |
| "learning_rate": 5.520000000000001e-07, |
| "loss": -0.0178, |
| "num_tokens": 38724663.0, |
| "reward": 1.5539352893829346, |
| "reward_std": 1.4159839153289795, |
| "rewards/reward_func/mean": 1.5539352893829346, |
| "rewards/reward_func/std": 1.4159839153289795, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9653639793395996, |
| "sampling/importance_sampling_ratio/mean": 0.9901857376098633, |
| "sampling/importance_sampling_ratio/min": 0.2668585479259491, |
| "sampling/sampling_logp_difference/max": 1.3210365772247314, |
| "sampling/sampling_logp_difference/mean": 0.019712798297405243, |
| "step": 113, |
| "step_time": 342.8124888394959 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13271.0, |
| "completions/max_terminated_length": 13271.0, |
| "completions/mean_length": 7762.90625, |
| "completions/mean_terminated_length": 7762.90625, |
| "completions/min_length": 3438.0, |
| "completions/min_terminated_length": 3438.0, |
| "entropy": 0.3000789564102888, |
| "epoch": 0.926829268292683, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4473620355129242, |
| "learning_rate": 5.48e-07, |
| "loss": 0.0414, |
| "num_tokens": 38988212.0, |
| "reward": 1.8438094854354858, |
| "reward_std": 1.0651018619537354, |
| "rewards/reward_func/mean": 1.8438094854354858, |
| "rewards/reward_func/std": 1.065101981163025, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1569113731384277, |
| "sampling/importance_sampling_ratio/mean": 0.989720344543457, |
| "sampling/importance_sampling_ratio/min": 0.4092179238796234, |
| "sampling/sampling_logp_difference/max": 0.8935074806213379, |
| "sampling/sampling_logp_difference/mean": 0.020644046366214752, |
| "step": 114, |
| "step_time": 314.952937441878 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16884.0, |
| "completions/max_terminated_length": 16884.0, |
| "completions/mean_length": 9699.75, |
| "completions/mean_terminated_length": 9699.75, |
| "completions/min_length": 4351.0, |
| "completions/min_terminated_length": 4351.0, |
| "entropy": 0.2816589046269655, |
| "epoch": 0.9349593495934959, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3920203149318695, |
| "learning_rate": 5.44e-07, |
| "loss": 0.0309, |
| "num_tokens": 39317620.0, |
| "reward": 1.1251944303512573, |
| "reward_std": 0.5114659667015076, |
| "rewards/reward_func/mean": 1.1298178434371948, |
| "rewards/reward_func/std": 0.5025987029075623, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0046234130859375, |
| "rewards/soft_overlong_punishment_reward/std": 0.021913941949605942, |
| "sampling/importance_sampling_ratio/max": 2.1895768642425537, |
| "sampling/importance_sampling_ratio/mean": 0.990413248538971, |
| "sampling/importance_sampling_ratio/min": 0.036309029906988144, |
| "sampling/sampling_logp_difference/max": 3.3156888484954834, |
| "sampling/sampling_logp_difference/mean": 0.019514720886945724, |
| "step": 115, |
| "step_time": 596.8481385947671 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13522.0, |
| "completions/max_terminated_length": 13522.0, |
| "completions/mean_length": 10156.0, |
| "completions/mean_terminated_length": 10156.0, |
| "completions/min_length": 8118.0, |
| "completions/min_terminated_length": 8118.0, |
| "entropy": 0.2807458247989416, |
| "epoch": 0.943089430894309, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39652687311172485, |
| "learning_rate": 5.4e-07, |
| "loss": 0.0079, |
| "num_tokens": 39666956.0, |
| "reward": 1.032971739768982, |
| "reward_std": 0.5176628232002258, |
| "rewards/reward_func/mean": 1.032971739768982, |
| "rewards/reward_func/std": 0.5176628232002258, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.300656318664551, |
| "sampling/importance_sampling_ratio/mean": 0.9902166128158569, |
| "sampling/importance_sampling_ratio/min": 0.06670020520687103, |
| "sampling/sampling_logp_difference/max": 2.707547187805176, |
| "sampling/sampling_logp_difference/mean": 0.020113978534936905, |
| "step": 116, |
| "step_time": 329.7457778744865 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 18599.0, |
| "completions/max_terminated_length": 18599.0, |
| "completions/mean_length": 10023.34375, |
| "completions/mean_terminated_length": 10023.34375, |
| "completions/min_length": 3017.0, |
| "completions/min_terminated_length": 3017.0, |
| "entropy": 0.29004107043147087, |
| "epoch": 0.9512195121951219, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3823752701282501, |
| "learning_rate": 5.36e-07, |
| "loss": 0.025, |
| "num_tokens": 40002903.0, |
| "reward": 1.0167418718338013, |
| "reward_std": 0.5728766322135925, |
| "rewards/reward_func/mean": 1.03364098072052, |
| "rewards/reward_func/std": 0.5336021780967712, |
| "rewards/soft_overlong_punishment_reward/mean": -0.01689910888671875, |
| "rewards/soft_overlong_punishment_reward/std": 0.09559579938650131, |
| "sampling/importance_sampling_ratio/max": 2.599911689758301, |
| "sampling/importance_sampling_ratio/mean": 0.9898045063018799, |
| "sampling/importance_sampling_ratio/min": 0.2843405604362488, |
| "sampling/sampling_logp_difference/max": 1.257582664489746, |
| "sampling/sampling_logp_difference/mean": 0.020362723618745804, |
| "step": 117, |
| "step_time": 400.2332091778517 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 20165.0, |
| "completions/max_terminated_length": 20165.0, |
| "completions/mean_length": 11463.84375, |
| "completions/mean_terminated_length": 11463.84375, |
| "completions/min_length": 6103.0, |
| "completions/min_terminated_length": 6103.0, |
| "entropy": 0.24839603807777166, |
| "epoch": 0.959349593495935, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36443084478378296, |
| "learning_rate": 5.32e-07, |
| "loss": 0.0414, |
| "num_tokens": 40388106.0, |
| "reward": 1.3802266120910645, |
| "reward_std": 1.3856797218322754, |
| "rewards/reward_func/mean": 1.4090733528137207, |
| "rewards/reward_func/std": 1.3463064432144165, |
| "rewards/soft_overlong_punishment_reward/mean": -0.02884674072265625, |
| "rewards/soft_overlong_punishment_reward/std": 0.16318179666996002, |
| "sampling/importance_sampling_ratio/max": 2.2451746463775635, |
| "sampling/importance_sampling_ratio/mean": 0.9913095235824585, |
| "sampling/importance_sampling_ratio/min": 0.05133594200015068, |
| "sampling/sampling_logp_difference/max": 2.9693641662597656, |
| "sampling/sampling_logp_difference/mean": 0.018298882991075516, |
| "step": 118, |
| "step_time": 442.66103106434457 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 11502.0, |
| "completions/max_terminated_length": 11502.0, |
| "completions/mean_length": 9009.84375, |
| "completions/mean_terminated_length": 9009.84375, |
| "completions/min_length": 5196.0, |
| "completions/min_terminated_length": 5196.0, |
| "entropy": 0.3013112135231495, |
| "epoch": 0.967479674796748, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4052444398403168, |
| "learning_rate": 5.28e-07, |
| "loss": 0.0246, |
| "num_tokens": 40690197.0, |
| "reward": 2.287106513977051, |
| "reward_std": 3.716765880584717, |
| "rewards/reward_func/mean": 2.287106513977051, |
| "rewards/reward_func/std": 3.7167656421661377, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.6317532062530518, |
| "sampling/importance_sampling_ratio/mean": 0.989405632019043, |
| "sampling/importance_sampling_ratio/min": 0.2487541288137436, |
| "sampling/sampling_logp_difference/max": 1.391290307044983, |
| "sampling/sampling_logp_difference/mean": 0.02114209532737732, |
| "step": 119, |
| "step_time": 497.79962878953665 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15640.0, |
| "completions/max_terminated_length": 15640.0, |
| "completions/mean_length": 11166.90625, |
| "completions/mean_terminated_length": 11166.90625, |
| "completions/min_length": 7151.0, |
| "completions/min_terminated_length": 7151.0, |
| "entropy": 0.24014894664287567, |
| "epoch": 0.975609756097561, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3485146760940552, |
| "learning_rate": 5.24e-07, |
| "loss": -0.0227, |
| "num_tokens": 41079202.0, |
| "reward": 1.5909485816955566, |
| "reward_std": 2.074122667312622, |
| "rewards/reward_func/mean": 1.5909485816955566, |
| "rewards/reward_func/std": 2.074122667312622, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.659359931945801, |
| "sampling/importance_sampling_ratio/mean": 0.991703987121582, |
| "sampling/importance_sampling_ratio/min": 0.27526989579200745, |
| "sampling/sampling_logp_difference/max": 1.2900032997131348, |
| "sampling/sampling_logp_difference/mean": 0.017743926495313644, |
| "step": 120, |
| "step_time": 416.90072300843894 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 20480.0, |
| "completions/max_terminated_length": 12728.0, |
| "completions/mean_length": 9105.25, |
| "completions/mean_terminated_length": 8738.322265625, |
| "completions/min_length": 5002.0, |
| "completions/min_terminated_length": 5002.0, |
| "entropy": 0.28143354691565037, |
| "epoch": 0.983739837398374, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39125847816467285, |
| "learning_rate": 5.2e-07, |
| "loss": -0.049, |
| "num_tokens": 41386626.0, |
| "reward": 1.0522152185440063, |
| "reward_std": 0.7879756093025208, |
| "rewards/reward_func/mean": 1.1184157133102417, |
| "rewards/reward_func/std": 0.7047606706619263, |
| "rewards/soft_overlong_punishment_reward/mean": -0.03125, |
| "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, |
| "sampling/importance_sampling_ratio/max": 2.605196475982666, |
| "sampling/importance_sampling_ratio/mean": 0.9898760318756104, |
| "sampling/importance_sampling_ratio/min": 0.12439469248056412, |
| "sampling/sampling_logp_difference/max": 2.0842957496643066, |
| "sampling/sampling_logp_difference/mean": 0.020777855068445206, |
| "step": 121, |
| "step_time": 394.79837024072185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13989.0, |
| "completions/max_terminated_length": 13989.0, |
| "completions/mean_length": 9587.40625, |
| "completions/mean_terminated_length": 9587.40625, |
| "completions/min_length": 6042.0, |
| "completions/min_terminated_length": 6042.0, |
| "entropy": 0.2911383733153343, |
| "epoch": 0.991869918699187, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40522804856300354, |
| "learning_rate": 5.16e-07, |
| "loss": 0.0389, |
| "num_tokens": 41711359.0, |
| "reward": 2.1636319160461426, |
| "reward_std": 2.066298246383667, |
| "rewards/reward_func/mean": 2.1636319160461426, |
| "rewards/reward_func/std": 2.066298007965088, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1135575771331787, |
| "sampling/importance_sampling_ratio/mean": 0.9899325370788574, |
| "sampling/importance_sampling_ratio/min": 0.3981960415840149, |
| "sampling/sampling_logp_difference/max": 0.9208108186721802, |
| "sampling/sampling_logp_difference/mean": 0.02073751576244831, |
| "step": 122, |
| "step_time": 348.1325803932268 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14597.0, |
| "completions/max_terminated_length": 14597.0, |
| "completions/mean_length": 9709.09375, |
| "completions/mean_terminated_length": 9709.09375, |
| "completions/min_length": 4628.0, |
| "completions/min_terminated_length": 4628.0, |
| "entropy": 0.2827394837513566, |
| "epoch": 1.0, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3837268650531769, |
| "learning_rate": 5.12e-07, |
| "loss": 0.0102, |
| "num_tokens": 42045618.0, |
| "reward": 1.2427458763122559, |
| "reward_std": 0.4029768705368042, |
| "rewards/reward_func/mean": 1.2427458763122559, |
| "rewards/reward_func/std": 0.4029768705368042, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902385473251343, |
| "sampling/importance_sampling_ratio/min": 0.26592302322387695, |
| "sampling/sampling_logp_difference/max": 1.5374226570129395, |
| "sampling/sampling_logp_difference/mean": 0.01998906210064888, |
| "step": 123, |
| "step_time": 447.3557674009353 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 10975.0, |
| "completions/max_terminated_length": 10975.0, |
| "completions/mean_length": 6729.0, |
| "completions/mean_terminated_length": 6729.0, |
| "completions/min_length": 3134.0, |
| "completions/min_terminated_length": 3134.0, |
| "entropy": 0.30727832205593586, |
| "epoch": 1.008130081300813, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5060835480690002, |
| "learning_rate": 5.079999999999999e-07, |
| "loss": -0.0037, |
| "num_tokens": 42272234.0, |
| "reward": 2.10992693901062, |
| "reward_std": 1.1074674129486084, |
| "rewards/reward_func/mean": 2.10992693901062, |
| "rewards/reward_func/std": 1.1074674129486084, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.500887155532837, |
| "sampling/importance_sampling_ratio/mean": 0.9892988204956055, |
| "sampling/importance_sampling_ratio/min": 1.925503283928265e-06, |
| "sampling/sampling_logp_difference/max": 13.160323143005371, |
| "sampling/sampling_logp_difference/mean": 0.021586474031209946, |
| "step": 124, |
| "step_time": 272.6258725624066 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 11174.0, |
| "completions/max_terminated_length": 11174.0, |
| "completions/mean_length": 8467.71875, |
| "completions/mean_terminated_length": 8467.71875, |
| "completions/min_length": 6445.0, |
| "completions/min_terminated_length": 6445.0, |
| "entropy": 0.2736167572438717, |
| "epoch": 1.016260162601626, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4135052263736725, |
| "learning_rate": 5.04e-07, |
| "loss": -0.035, |
| "num_tokens": 42569737.0, |
| "reward": 1.3765490055084229, |
| "reward_std": 0.8696650862693787, |
| "rewards/reward_func/mean": 1.3765490055084229, |
| "rewards/reward_func/std": 0.8696651458740234, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3245065212249756, |
| "sampling/importance_sampling_ratio/mean": 0.990541934967041, |
| "sampling/importance_sampling_ratio/min": 0.31107988953590393, |
| "sampling/sampling_logp_difference/max": 1.1677055358886719, |
| "sampling/sampling_logp_difference/mean": 0.019098468124866486, |
| "step": 125, |
| "step_time": 385.7312441198155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 11560.0, |
| "completions/max_terminated_length": 11560.0, |
| "completions/mean_length": 8247.96875, |
| "completions/mean_terminated_length": 8247.96875, |
| "completions/min_length": 6429.0, |
| "completions/min_terminated_length": 6429.0, |
| "entropy": 0.30573431588709354, |
| "epoch": 1.024390243902439, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.6381073594093323, |
| "learning_rate": 5e-07, |
| "loss": -0.0225, |
| "num_tokens": 42846160.0, |
| "reward": 1.529063105583191, |
| "reward_std": 0.6035394668579102, |
| "rewards/reward_func/mean": 1.529063105583191, |
| "rewards/reward_func/std": 0.6035395264625549, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.874021291732788, |
| "sampling/importance_sampling_ratio/mean": 0.9894424676895142, |
| "sampling/importance_sampling_ratio/min": 5.591235822066665e-05, |
| "sampling/sampling_logp_difference/max": 9.791725158691406, |
| "sampling/sampling_logp_difference/mean": 0.021437201648950577, |
| "step": 126, |
| "step_time": 374.45182742597535 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16217.0, |
| "completions/max_terminated_length": 16217.0, |
| "completions/mean_length": 9949.71875, |
| "completions/mean_terminated_length": 9949.71875, |
| "completions/min_length": 4652.0, |
| "completions/min_terminated_length": 4652.0, |
| "entropy": 0.31133372336626053, |
| "epoch": 1.032520325203252, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39641663432121277, |
| "learning_rate": 4.96e-07, |
| "loss": 0.0177, |
| "num_tokens": 43176799.0, |
| "reward": 1.072230577468872, |
| "reward_std": 0.6142371892929077, |
| "rewards/reward_func/mean": 1.072230577468872, |
| "rewards/reward_func/std": 0.6142371892929077, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1118323802948, |
| "sampling/importance_sampling_ratio/mean": 0.989276647567749, |
| "sampling/importance_sampling_ratio/min": 0.3954460620880127, |
| "sampling/sampling_logp_difference/max": 0.9277408719062805, |
| "sampling/sampling_logp_difference/mean": 0.021818656474351883, |
| "step": 127, |
| "step_time": 381.2957224531565 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12746.0, |
| "completions/max_terminated_length": 12746.0, |
| "completions/mean_length": 9191.84375, |
| "completions/mean_terminated_length": 9191.84375, |
| "completions/min_length": 5782.0, |
| "completions/min_terminated_length": 5782.0, |
| "entropy": 0.30142006278038025, |
| "epoch": 1.040650406504065, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42082273960113525, |
| "learning_rate": 4.92e-07, |
| "loss": 0.0094, |
| "num_tokens": 43484650.0, |
| "reward": 1.3620415925979614, |
| "reward_std": 1.3542721271514893, |
| "rewards/reward_func/mean": 1.3620415925979614, |
| "rewards/reward_func/std": 1.3542720079421997, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0942885875701904, |
| "sampling/importance_sampling_ratio/mean": 0.9896490573883057, |
| "sampling/importance_sampling_ratio/min": 0.2504059672355652, |
| "sampling/sampling_logp_difference/max": 1.3846718072891235, |
| "sampling/sampling_logp_difference/mean": 0.020984603092074394, |
| "step": 128, |
| "step_time": 440.56851464207284 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14303.0, |
| "completions/max_terminated_length": 14303.0, |
| "completions/mean_length": 9928.21875, |
| "completions/mean_terminated_length": 9928.21875, |
| "completions/min_length": 6230.0, |
| "completions/min_terminated_length": 6230.0, |
| "entropy": 0.2877661660313606, |
| "epoch": 1.048780487804878, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39021143317222595, |
| "learning_rate": 4.879999999999999e-07, |
| "loss": 0.0127, |
| "num_tokens": 43820457.0, |
| "reward": 0.8574129343032837, |
| "reward_std": 0.572685182094574, |
| "rewards/reward_func/mean": 0.8574129343032837, |
| "rewards/reward_func/std": 0.5726851224899292, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1260764598846436, |
| "sampling/importance_sampling_ratio/mean": 0.9900418519973755, |
| "sampling/importance_sampling_ratio/min": 0.3720153272151947, |
| "sampling/sampling_logp_difference/max": 0.9888203144073486, |
| "sampling/sampling_logp_difference/mean": 0.02001476287841797, |
| "step": 129, |
| "step_time": 541.4112695821095 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15507.0, |
| "completions/max_terminated_length": 15507.0, |
| "completions/mean_length": 11027.96875, |
| "completions/mean_terminated_length": 11027.96875, |
| "completions/min_length": 6492.0, |
| "completions/min_terminated_length": 6492.0, |
| "entropy": 0.24912087991833687, |
| "epoch": 1.056910569105691, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3552826941013336, |
| "learning_rate": 4.839999999999999e-07, |
| "loss": 0.0134, |
| "num_tokens": 44211528.0, |
| "reward": 0.9818797707557678, |
| "reward_std": 0.6069151759147644, |
| "rewards/reward_func/mean": 0.9818797707557678, |
| "rewards/reward_func/std": 0.6069151163101196, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1495964527130127, |
| "sampling/importance_sampling_ratio/mean": 0.9914515614509583, |
| "sampling/importance_sampling_ratio/min": 0.2990056574344635, |
| "sampling/sampling_logp_difference/max": 1.2072927951812744, |
| "sampling/sampling_logp_difference/mean": 0.017646770924329758, |
| "step": 130, |
| "step_time": 481.262499995064 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17006.0, |
| "completions/max_terminated_length": 17006.0, |
| "completions/mean_length": 9569.5, |
| "completions/mean_terminated_length": 9569.5, |
| "completions/min_length": 3066.0, |
| "completions/min_terminated_length": 3066.0, |
| "entropy": 0.27375591546297073, |
| "epoch": 1.065040650406504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40961822867393494, |
| "learning_rate": 4.8e-07, |
| "loss": 0.0289, |
| "num_tokens": 44543752.0, |
| "reward": 1.1588225364685059, |
| "reward_std": 1.0106967687606812, |
| "rewards/reward_func/mean": 1.1635680198669434, |
| "rewards/reward_func/std": 1.0049266815185547, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0047454833984375, |
| "rewards/soft_overlong_punishment_reward/std": 0.026844507083296776, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9909744262695312, |
| "sampling/importance_sampling_ratio/min": 0.08428187668323517, |
| "sampling/sampling_logp_difference/max": 2.473588466644287, |
| "sampling/sampling_logp_difference/mean": 0.01833171769976616, |
| "step": 131, |
| "step_time": 601.2343452193309 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14444.0, |
| "completions/max_terminated_length": 14444.0, |
| "completions/mean_length": 11014.25, |
| "completions/mean_terminated_length": 11014.25, |
| "completions/min_length": 8324.0, |
| "completions/min_terminated_length": 8324.0, |
| "entropy": 0.2876242399215698, |
| "epoch": 1.0731707317073171, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4003022015094757, |
| "learning_rate": 4.76e-07, |
| "loss": -0.0106, |
| "num_tokens": 44918296.0, |
| "reward": 0.595725953578949, |
| "reward_std": 0.6117147207260132, |
| "rewards/reward_func/mean": 0.595725953578949, |
| "rewards/reward_func/std": 0.6117147207260132, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3590211868286133, |
| "sampling/importance_sampling_ratio/mean": 0.9899647235870361, |
| "sampling/importance_sampling_ratio/min": 0.177188441157341, |
| "sampling/sampling_logp_difference/max": 1.730541467666626, |
| "sampling/sampling_logp_difference/mean": 0.020228557288646698, |
| "step": 132, |
| "step_time": 359.82743262615986 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14076.0, |
| "completions/max_terminated_length": 14076.0, |
| "completions/mean_length": 10530.5, |
| "completions/mean_terminated_length": 10530.5, |
| "completions/min_length": 7319.0, |
| "completions/min_terminated_length": 7319.0, |
| "entropy": 0.28902580961585045, |
| "epoch": 1.08130081300813, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3696592152118683, |
| "learning_rate": 4.7199999999999994e-07, |
| "loss": 0.0044, |
| "num_tokens": 45273288.0, |
| "reward": 1.0774266719818115, |
| "reward_std": 0.7130836248397827, |
| "rewards/reward_func/mean": 1.0774266719818115, |
| "rewards/reward_func/std": 0.7130836248397827, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9898357391357422, |
| "sampling/importance_sampling_ratio/min": 0.2908354103565216, |
| "sampling/sampling_logp_difference/max": 1.9460086822509766, |
| "sampling/sampling_logp_difference/mean": 0.020634226500988007, |
| "step": 133, |
| "step_time": 376.208888650639 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13802.0, |
| "completions/max_terminated_length": 13802.0, |
| "completions/mean_length": 9979.875, |
| "completions/mean_terminated_length": 9979.875, |
| "completions/min_length": 5474.0, |
| "completions/min_terminated_length": 5474.0, |
| "entropy": 0.27061324566602707, |
| "epoch": 1.089430894308943, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3747713267803192, |
| "learning_rate": 4.68e-07, |
| "loss": 0.0181, |
| "num_tokens": 45618380.0, |
| "reward": 0.9764193296432495, |
| "reward_std": 0.5758468508720398, |
| "rewards/reward_func/mean": 0.9764193296432495, |
| "rewards/reward_func/std": 0.575846791267395, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9906251430511475, |
| "sampling/importance_sampling_ratio/min": 0.396713525056839, |
| "sampling/sampling_logp_difference/max": 1.3333165645599365, |
| "sampling/sampling_logp_difference/mean": 0.01938489079475403, |
| "step": 134, |
| "step_time": 360.48224557284266 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16511.0, |
| "completions/max_terminated_length": 16511.0, |
| "completions/mean_length": 12193.5625, |
| "completions/mean_terminated_length": 12193.5625, |
| "completions/min_length": 7962.0, |
| "completions/min_terminated_length": 7962.0, |
| "entropy": 0.2715284349396825, |
| "epoch": 1.0975609756097562, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3628915250301361, |
| "learning_rate": 4.64e-07, |
| "loss": -0.0121, |
| "num_tokens": 46030086.0, |
| "reward": 0.60393226146698, |
| "reward_std": 0.7513862252235413, |
| "rewards/reward_func/mean": 0.6057404279708862, |
| "rewards/reward_func/std": 0.7498459815979004, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00180816650390625, |
| "rewards/soft_overlong_punishment_reward/std": 0.007134551648050547, |
| "sampling/importance_sampling_ratio/max": 2.5155422687530518, |
| "sampling/importance_sampling_ratio/mean": 0.9904724359512329, |
| "sampling/importance_sampling_ratio/min": 0.11712665855884552, |
| "sampling/sampling_logp_difference/max": 2.1444993019104004, |
| "sampling/sampling_logp_difference/mean": 0.019424114376306534, |
| "step": 135, |
| "step_time": 433.3370830931235 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 20480.0, |
| "completions/max_terminated_length": 13810.0, |
| "completions/mean_length": 10165.46875, |
| "completions/mean_terminated_length": 9832.7412109375, |
| "completions/min_length": 5537.0, |
| "completions/min_terminated_length": 5537.0, |
| "entropy": 0.27419494558125734, |
| "epoch": 1.1056910569105691, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3413371443748474, |
| "learning_rate": 4.6e-07, |
| "loss": -0.0679, |
| "num_tokens": 46373613.0, |
| "reward": 1.0951216220855713, |
| "reward_std": 0.6765117049217224, |
| "rewards/reward_func/mean": 1.1627061367034912, |
| "rewards/reward_func/std": 0.5673498511314392, |
| "rewards/soft_overlong_punishment_reward/mean": -0.03125, |
| "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, |
| "sampling/importance_sampling_ratio/max": 2.397608518600464, |
| "sampling/importance_sampling_ratio/mean": 0.9900693297386169, |
| "sampling/importance_sampling_ratio/min": 0.03842584043741226, |
| "sampling/sampling_logp_difference/max": 3.2590250968933105, |
| "sampling/sampling_logp_difference/mean": 0.02013106644153595, |
| "step": 136, |
| "step_time": 417.4300327305682 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13043.0, |
| "completions/max_terminated_length": 13043.0, |
| "completions/mean_length": 9212.1875, |
| "completions/mean_terminated_length": 9212.1875, |
| "completions/min_length": 4419.0, |
| "completions/min_terminated_length": 4419.0, |
| "entropy": 0.32444891706109047, |
| "epoch": 1.113821138211382, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43532252311706543, |
| "learning_rate": 4.56e-07, |
| "loss": -0.0185, |
| "num_tokens": 46680027.0, |
| "reward": 0.972080647945404, |
| "reward_std": 0.6541093587875366, |
| "rewards/reward_func/mean": 0.972080647945404, |
| "rewards/reward_func/std": 0.6541092991828918, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.6352925300598145, |
| "sampling/importance_sampling_ratio/mean": 0.9887343645095825, |
| "sampling/importance_sampling_ratio/min": 0.02297734096646309, |
| "sampling/sampling_logp_difference/max": 3.7732467651367188, |
| "sampling/sampling_logp_difference/mean": 0.022742385044693947, |
| "step": 137, |
| "step_time": 321.7123435754329 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14211.0, |
| "completions/max_terminated_length": 14211.0, |
| "completions/mean_length": 9940.3125, |
| "completions/mean_terminated_length": 9940.3125, |
| "completions/min_length": 5922.0, |
| "completions/min_terminated_length": 5922.0, |
| "entropy": 0.2774552209302783, |
| "epoch": 1.1219512195121952, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41212156414985657, |
| "learning_rate": 4.5199999999999997e-07, |
| "loss": -0.0025, |
| "num_tokens": 47020813.0, |
| "reward": 0.8475947380065918, |
| "reward_std": 0.6312429308891296, |
| "rewards/reward_func/mean": 0.8475947380065918, |
| "rewards/reward_func/std": 0.6312428712844849, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.085104465484619, |
| "sampling/importance_sampling_ratio/mean": 0.9904999732971191, |
| "sampling/importance_sampling_ratio/min": 0.011845460161566734, |
| "sampling/sampling_logp_difference/max": 4.435810565948486, |
| "sampling/sampling_logp_difference/mean": 0.01984248496592045, |
| "step": 138, |
| "step_time": 367.3463532931637 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12646.0, |
| "completions/max_terminated_length": 12646.0, |
| "completions/mean_length": 9365.15625, |
| "completions/mean_terminated_length": 9365.15625, |
| "completions/min_length": 4490.0, |
| "completions/min_terminated_length": 4490.0, |
| "entropy": 0.30285973846912384, |
| "epoch": 1.1300813008130082, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.44502395391464233, |
| "learning_rate": 4.48e-07, |
| "loss": 0.0451, |
| "num_tokens": 47334066.0, |
| "reward": 1.4396097660064697, |
| "reward_std": 0.8284114599227905, |
| "rewards/reward_func/mean": 1.4396097660064697, |
| "rewards/reward_func/std": 0.8284114003181458, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0112507343292236, |
| "sampling/importance_sampling_ratio/mean": 0.9893834590911865, |
| "sampling/importance_sampling_ratio/min": 0.3616284430027008, |
| "sampling/sampling_logp_difference/max": 1.0171380043029785, |
| "sampling/sampling_logp_difference/mean": 0.021245401352643967, |
| "step": 139, |
| "step_time": 320.48481974727474 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15627.0, |
| "completions/max_terminated_length": 15627.0, |
| "completions/mean_length": 9530.0625, |
| "completions/mean_terminated_length": 9530.0625, |
| "completions/min_length": 2759.0, |
| "completions/min_terminated_length": 2759.0, |
| "entropy": 0.31202419102191925, |
| "epoch": 1.1382113821138211, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42070963978767395, |
| "learning_rate": 4.44e-07, |
| "loss": -0.0332, |
| "num_tokens": 47657948.0, |
| "reward": 0.9002124071121216, |
| "reward_std": 0.6398483514785767, |
| "rewards/reward_func/mean": 0.9002124071121216, |
| "rewards/reward_func/std": 0.6398482918739319, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.8821748495101929, |
| "sampling/importance_sampling_ratio/mean": 0.989189088344574, |
| "sampling/importance_sampling_ratio/min": 0.011666374281048775, |
| "sampling/sampling_logp_difference/max": 4.45104455947876, |
| "sampling/sampling_logp_difference/mean": 0.02178381383419037, |
| "step": 140, |
| "step_time": 358.79695148952305 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 20480.0, |
| "completions/max_terminated_length": 14658.0, |
| "completions/mean_length": 10385.03125, |
| "completions/mean_terminated_length": 10059.38671875, |
| "completions/min_length": 3841.0, |
| "completions/min_terminated_length": 3841.0, |
| "entropy": 0.29269325640052557, |
| "epoch": 1.146341463414634, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3514389991760254, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": -0.0493, |
| "num_tokens": 48003493.0, |
| "reward": 0.6447830200195312, |
| "reward_std": 0.72358638048172, |
| "rewards/reward_func/mean": 0.6760330200195312, |
| "rewards/reward_func/std": 0.6698598265647888, |
| "rewards/soft_overlong_punishment_reward/mean": -0.03125, |
| "rewards/soft_overlong_punishment_reward/std": 0.1767766922712326, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9895683526992798, |
| "sampling/importance_sampling_ratio/min": 0.37845414876937866, |
| "sampling/sampling_logp_difference/max": 1.1276581287384033, |
| "sampling/sampling_logp_difference/mean": 0.020915433764457703, |
| "step": 141, |
| "step_time": 402.56251001451164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13964.0, |
| "completions/max_terminated_length": 13964.0, |
| "completions/mean_length": 7764.46875, |
| "completions/mean_terminated_length": 7764.46875, |
| "completions/min_length": 3539.0, |
| "completions/min_terminated_length": 3539.0, |
| "entropy": 0.2780707832425833, |
| "epoch": 1.1544715447154472, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.44146835803985596, |
| "learning_rate": 4.36e-07, |
| "loss": 0.0156, |
| "num_tokens": 48265428.0, |
| "reward": 5.029313087463379, |
| "reward_std": 7.416443824768066, |
| "rewards/reward_func/mean": 5.029313087463379, |
| "rewards/reward_func/std": 7.41644287109375, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902679324150085, |
| "sampling/importance_sampling_ratio/min": 0.0015649524284526706, |
| "sampling/sampling_logp_difference/max": 6.45989990234375, |
| "sampling/sampling_logp_difference/mean": 0.019718850031495094, |
| "step": 142, |
| "step_time": 491.4185498545412 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15411.0, |
| "completions/max_terminated_length": 15411.0, |
| "completions/mean_length": 11449.90625, |
| "completions/mean_terminated_length": 11449.90625, |
| "completions/min_length": 7684.0, |
| "completions/min_terminated_length": 7684.0, |
| "entropy": 0.2600766168907285, |
| "epoch": 1.1626016260162602, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34485191106796265, |
| "learning_rate": 4.3199999999999995e-07, |
| "loss": 0.0211, |
| "num_tokens": 48654609.0, |
| "reward": 1.2525250911712646, |
| "reward_std": 0.9412046670913696, |
| "rewards/reward_func/mean": 1.2525250911712646, |
| "rewards/reward_func/std": 0.9412046670913696, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.541574716567993, |
| "sampling/importance_sampling_ratio/mean": 0.9909282326698303, |
| "sampling/importance_sampling_ratio/min": 0.34744107723236084, |
| "sampling/sampling_logp_difference/max": 1.0571601390838623, |
| "sampling/sampling_logp_difference/mean": 0.01887846551835537, |
| "step": 143, |
| "step_time": 605.705682055559 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15606.0, |
| "completions/max_terminated_length": 15606.0, |
| "completions/mean_length": 10148.96875, |
| "completions/mean_terminated_length": 10148.96875, |
| "completions/min_length": 6432.0, |
| "completions/min_terminated_length": 6432.0, |
| "entropy": 0.28575040213763714, |
| "epoch": 1.170731707317073, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3891197741031647, |
| "learning_rate": 4.2799999999999997e-07, |
| "loss": 0.0339, |
| "num_tokens": 48996144.0, |
| "reward": 2.4553003311157227, |
| "reward_std": 3.728713274002075, |
| "rewards/reward_func/mean": 2.4553003311157227, |
| "rewards/reward_func/std": 3.728713274002075, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901723861694336, |
| "sampling/importance_sampling_ratio/min": 0.02774185873568058, |
| "sampling/sampling_logp_difference/max": 3.584812879562378, |
| "sampling/sampling_logp_difference/mean": 0.02012861892580986, |
| "step": 144, |
| "step_time": 440.6931741584558 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14661.0, |
| "completions/max_terminated_length": 14661.0, |
| "completions/mean_length": 10413.6875, |
| "completions/mean_terminated_length": 10413.6875, |
| "completions/min_length": 5738.0, |
| "completions/min_terminated_length": 5738.0, |
| "entropy": 0.2827681256458163, |
| "epoch": 1.1788617886178863, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.398101806640625, |
| "learning_rate": 4.24e-07, |
| "loss": 0.0233, |
| "num_tokens": 49352246.0, |
| "reward": 0.9063984751701355, |
| "reward_std": 0.6884288787841797, |
| "rewards/reward_func/mean": 0.9063984751701355, |
| "rewards/reward_func/std": 0.6884288787841797, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.003222703933716, |
| "sampling/importance_sampling_ratio/mean": 0.9902200102806091, |
| "sampling/importance_sampling_ratio/min": 0.07585802674293518, |
| "sampling/sampling_logp_difference/max": 2.5788917541503906, |
| "sampling/sampling_logp_difference/mean": 0.0197792686522007, |
| "step": 145, |
| "step_time": 369.9440093538724 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13108.0, |
| "completions/max_terminated_length": 13108.0, |
| "completions/mean_length": 8945.3125, |
| "completions/mean_terminated_length": 8945.3125, |
| "completions/min_length": 4501.0, |
| "completions/min_terminated_length": 4501.0, |
| "entropy": 0.3082218300551176, |
| "epoch": 1.1869918699186992, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43946680426597595, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": -0.0153, |
| "num_tokens": 49652336.0, |
| "reward": 1.0096275806427002, |
| "reward_std": 0.6723037958145142, |
| "rewards/reward_func/mean": 1.0096275806427002, |
| "rewards/reward_func/std": 0.6723037958145142, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.288572072982788, |
| "sampling/importance_sampling_ratio/mean": 0.9894624948501587, |
| "sampling/importance_sampling_ratio/min": 2.8281038377440468e-18, |
| "sampling/sampling_logp_difference/max": 40.406925201416016, |
| "sampling/sampling_logp_difference/mean": 0.021884791553020477, |
| "step": 146, |
| "step_time": 309.9006433764007 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13444.0, |
| "completions/max_terminated_length": 13444.0, |
| "completions/mean_length": 9562.09375, |
| "completions/mean_terminated_length": 9562.09375, |
| "completions/min_length": 5406.0, |
| "completions/min_terminated_length": 5406.0, |
| "entropy": 0.29419814236462116, |
| "epoch": 1.1951219512195121, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41722893714904785, |
| "learning_rate": 4.1599999999999997e-07, |
| "loss": 0.0188, |
| "num_tokens": 49973571.0, |
| "reward": 1.3096396923065186, |
| "reward_std": 0.5881022214889526, |
| "rewards/reward_func/mean": 1.3096396923065186, |
| "rewards/reward_func/std": 0.5881022214889526, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.009949207305908, |
| "sampling/importance_sampling_ratio/mean": 0.9897855520248413, |
| "sampling/importance_sampling_ratio/min": 0.03852084279060364, |
| "sampling/sampling_logp_difference/max": 3.2565557956695557, |
| "sampling/sampling_logp_difference/mean": 0.020911747589707375, |
| "step": 147, |
| "step_time": 319.92451414023526 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14260.0, |
| "completions/max_terminated_length": 14260.0, |
| "completions/mean_length": 9389.1875, |
| "completions/mean_terminated_length": 9389.1875, |
| "completions/min_length": 4720.0, |
| "completions/min_terminated_length": 4720.0, |
| "entropy": 0.2792348377406597, |
| "epoch": 1.203252032520325, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4073498547077179, |
| "learning_rate": 4.12e-07, |
| "loss": -0.0508, |
| "num_tokens": 50295825.0, |
| "reward": 1.020705223083496, |
| "reward_std": 0.5470594167709351, |
| "rewards/reward_func/mean": 1.020705223083496, |
| "rewards/reward_func/std": 0.5470594167709351, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.9243781566619873, |
| "sampling/importance_sampling_ratio/mean": 0.9900591969490051, |
| "sampling/importance_sampling_ratio/min": 0.13670256733894348, |
| "sampling/sampling_logp_difference/max": 1.9899476766586304, |
| "sampling/sampling_logp_difference/mean": 0.02007705718278885, |
| "step": 148, |
| "step_time": 379.96521122637205 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12088.0, |
| "completions/max_terminated_length": 12088.0, |
| "completions/mean_length": 8509.34375, |
| "completions/mean_terminated_length": 8509.34375, |
| "completions/min_length": 5576.0, |
| "completions/min_terminated_length": 5576.0, |
| "entropy": 0.293182197958231, |
| "epoch": 1.2113821138211383, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41748881340026855, |
| "learning_rate": 4.0799999999999995e-07, |
| "loss": 0.0059, |
| "num_tokens": 50583228.0, |
| "reward": 1.2018049955368042, |
| "reward_std": 0.4176173210144043, |
| "rewards/reward_func/mean": 1.2018049955368042, |
| "rewards/reward_func/std": 0.4176173210144043, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9898430705070496, |
| "sampling/importance_sampling_ratio/min": 0.21498073637485504, |
| "sampling/sampling_logp_difference/max": 1.5372068881988525, |
| "sampling/sampling_logp_difference/mean": 0.02043253555893898, |
| "step": 149, |
| "step_time": 274.8332504169084 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13540.0, |
| "completions/max_terminated_length": 13540.0, |
| "completions/mean_length": 8832.40625, |
| "completions/mean_terminated_length": 8832.40625, |
| "completions/min_length": 5019.0, |
| "completions/min_terminated_length": 5019.0, |
| "entropy": 0.30714407935738564, |
| "epoch": 1.2195121951219512, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4428982138633728, |
| "learning_rate": 4.04e-07, |
| "loss": -0.0408, |
| "num_tokens": 50876777.0, |
| "reward": 1.8848071098327637, |
| "reward_std": 1.2892030477523804, |
| "rewards/reward_func/mean": 1.8848071098327637, |
| "rewards/reward_func/std": 1.2892030477523804, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.8551710844039917, |
| "sampling/importance_sampling_ratio/mean": 0.9889999032020569, |
| "sampling/importance_sampling_ratio/min": 0.2573316991329193, |
| "sampling/sampling_logp_difference/max": 1.3573893308639526, |
| "sampling/sampling_logp_difference/mean": 0.022047821432352066, |
| "step": 150, |
| "step_time": 301.8369232052937 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15122.0, |
| "completions/max_terminated_length": 15122.0, |
| "completions/mean_length": 9705.6875, |
| "completions/mean_terminated_length": 9705.6875, |
| "completions/min_length": 4756.0, |
| "completions/min_terminated_length": 4756.0, |
| "entropy": 0.2828159620985389, |
| "epoch": 1.2276422764227641, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4180329144001007, |
| "learning_rate": 4e-07, |
| "loss": 0.0365, |
| "num_tokens": 51209559.0, |
| "reward": 1.167612075805664, |
| "reward_std": 0.3664146959781647, |
| "rewards/reward_func/mean": 1.167612075805664, |
| "rewards/reward_func/std": 0.3664146959781647, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.355583906173706, |
| "sampling/importance_sampling_ratio/mean": 0.990098774433136, |
| "sampling/importance_sampling_ratio/min": 0.007337617687880993, |
| "sampling/sampling_logp_difference/max": 4.914741039276123, |
| "sampling/sampling_logp_difference/mean": 0.020141858607530594, |
| "step": 151, |
| "step_time": 430.1349473600276 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16404.0, |
| "completions/max_terminated_length": 16404.0, |
| "completions/mean_length": 9639.8125, |
| "completions/mean_terminated_length": 9639.8125, |
| "completions/min_length": 2322.0, |
| "completions/min_terminated_length": 2322.0, |
| "entropy": 0.2881935928016901, |
| "epoch": 1.2357723577235773, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3870021402835846, |
| "learning_rate": 3.96e-07, |
| "loss": 0.0149, |
| "num_tokens": 51537321.0, |
| "reward": 0.9652100205421448, |
| "reward_std": 0.6336724758148193, |
| "rewards/reward_func/mean": 0.9653626084327698, |
| "rewards/reward_func/std": 0.6334443092346191, |
| "rewards/soft_overlong_punishment_reward/mean": -0.000152587890625, |
| "rewards/soft_overlong_punishment_reward/std": 0.0008631674572825432, |
| "sampling/importance_sampling_ratio/max": 2.0554513931274414, |
| "sampling/importance_sampling_ratio/mean": 0.9902279376983643, |
| "sampling/importance_sampling_ratio/min": 0.1997445672750473, |
| "sampling/sampling_logp_difference/max": 1.6107158660888672, |
| "sampling/sampling_logp_difference/mean": 0.020111925899982452, |
| "step": 152, |
| "step_time": 403.21197831467725 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15311.0, |
| "completions/max_terminated_length": 15311.0, |
| "completions/mean_length": 9762.4375, |
| "completions/mean_terminated_length": 9762.4375, |
| "completions/min_length": 4994.0, |
| "completions/min_terminated_length": 4994.0, |
| "entropy": 0.26577026676386595, |
| "epoch": 1.2439024390243902, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9132823944091797, |
| "learning_rate": 3.92e-07, |
| "loss": 0.0401, |
| "num_tokens": 51871503.0, |
| "reward": 1.0906537771224976, |
| "reward_std": 1.0960371494293213, |
| "rewards/reward_func/mean": 1.0906537771224976, |
| "rewards/reward_func/std": 1.0960370302200317, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9909335374832153, |
| "sampling/importance_sampling_ratio/min": 0.032161299139261246, |
| "sampling/sampling_logp_difference/max": 3.4369914531707764, |
| "sampling/sampling_logp_difference/mean": 0.018757443875074387, |
| "step": 153, |
| "step_time": 492.0377260663081 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15140.0, |
| "completions/max_terminated_length": 15140.0, |
| "completions/mean_length": 8624.96875, |
| "completions/mean_terminated_length": 8624.96875, |
| "completions/min_length": 4203.0, |
| "completions/min_terminated_length": 4203.0, |
| "entropy": 0.28610084764659405, |
| "epoch": 1.2520325203252032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4432617127895355, |
| "learning_rate": 3.88e-07, |
| "loss": 0.0781, |
| "num_tokens": 52167454.0, |
| "reward": 1.4906249046325684, |
| "reward_std": 1.0208377838134766, |
| "rewards/reward_func/mean": 1.4906249046325684, |
| "rewards/reward_func/std": 1.0208377838134766, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.563368558883667, |
| "sampling/importance_sampling_ratio/mean": 0.9902315139770508, |
| "sampling/importance_sampling_ratio/min": 0.36291512846946716, |
| "sampling/sampling_logp_difference/max": 1.0135862827301025, |
| "sampling/sampling_logp_difference/mean": 0.02024753764271736, |
| "step": 154, |
| "step_time": 493.67347326362506 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13499.0, |
| "completions/max_terminated_length": 13499.0, |
| "completions/mean_length": 10185.09375, |
| "completions/mean_terminated_length": 10185.09375, |
| "completions/min_length": 5779.0, |
| "completions/min_terminated_length": 5779.0, |
| "entropy": 0.2801782637834549, |
| "epoch": 1.2601626016260163, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3910183310508728, |
| "learning_rate": 3.84e-07, |
| "loss": 0.0005, |
| "num_tokens": 52513713.0, |
| "reward": 1.3181421756744385, |
| "reward_std": 0.4532380700111389, |
| "rewards/reward_func/mean": 1.3181421756744385, |
| "rewards/reward_func/std": 0.45323804020881653, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9903202056884766, |
| "sampling/importance_sampling_ratio/min": 0.3429674208164215, |
| "sampling/sampling_logp_difference/max": 1.1217701435089111, |
| "sampling/sampling_logp_difference/mean": 0.01968192681670189, |
| "step": 155, |
| "step_time": 330.93778187967837 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15302.0, |
| "completions/max_terminated_length": 15302.0, |
| "completions/mean_length": 9648.78125, |
| "completions/mean_terminated_length": 9648.78125, |
| "completions/min_length": 5341.0, |
| "completions/min_terminated_length": 5341.0, |
| "entropy": 0.2959721256047487, |
| "epoch": 1.2682926829268293, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3946113884449005, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": -0.0071, |
| "num_tokens": 52837898.0, |
| "reward": 1.5226918458938599, |
| "reward_std": 1.2326607704162598, |
| "rewards/reward_func/mean": 1.5226918458938599, |
| "rewards/reward_func/std": 1.2326607704162598, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.7765085697174072, |
| "sampling/importance_sampling_ratio/mean": 0.9898279905319214, |
| "sampling/importance_sampling_ratio/min": 0.2457374781370163, |
| "sampling/sampling_logp_difference/max": 1.403491497039795, |
| "sampling/sampling_logp_difference/mean": 0.021123500540852547, |
| "step": 156, |
| "step_time": 403.72100708354264 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12292.0, |
| "completions/max_terminated_length": 12292.0, |
| "completions/mean_length": 8318.3125, |
| "completions/mean_terminated_length": 8318.3125, |
| "completions/min_length": 4767.0, |
| "completions/min_terminated_length": 4767.0, |
| "entropy": 0.3122062496840954, |
| "epoch": 1.2764227642276422, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.45133042335510254, |
| "learning_rate": 3.76e-07, |
| "loss": -0.0431, |
| "num_tokens": 53113636.0, |
| "reward": 1.1949347257614136, |
| "reward_std": 1.1354265213012695, |
| "rewards/reward_func/mean": 1.1949347257614136, |
| "rewards/reward_func/std": 1.1354265213012695, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0942885875701904, |
| "sampling/importance_sampling_ratio/mean": 0.9893242120742798, |
| "sampling/importance_sampling_ratio/min": 0.0679553896188736, |
| "sampling/sampling_logp_difference/max": 2.68890380859375, |
| "sampling/sampling_logp_difference/mean": 0.021612364798784256, |
| "step": 157, |
| "step_time": 298.1043352710549 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16644.0, |
| "completions/max_terminated_length": 16644.0, |
| "completions/mean_length": 9605.53125, |
| "completions/mean_terminated_length": 9605.53125, |
| "completions/min_length": 2687.0, |
| "completions/min_terminated_length": 2687.0, |
| "entropy": 0.3122168593108654, |
| "epoch": 1.2845528455284554, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4598052203655243, |
| "learning_rate": 3.72e-07, |
| "loss": 0.011, |
| "num_tokens": 53431205.0, |
| "reward": 1.7465304136276245, |
| "reward_std": 1.7538690567016602, |
| "rewards/reward_func/mean": 1.7485140562057495, |
| "rewards/reward_func/std": 1.7518490552902222, |
| "rewards/soft_overlong_punishment_reward/mean": -0.001983642578125, |
| "rewards/soft_overlong_punishment_reward/std": 0.011221176944673061, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9891939163208008, |
| "sampling/importance_sampling_ratio/min": 0.3125458359718323, |
| "sampling/sampling_logp_difference/max": 1.1630041599273682, |
| "sampling/sampling_logp_difference/mean": 0.021935712546110153, |
| "step": 158, |
| "step_time": 374.17676453036256 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14982.0, |
| "completions/max_terminated_length": 14982.0, |
| "completions/mean_length": 11967.25, |
| "completions/mean_terminated_length": 11967.25, |
| "completions/min_length": 9173.0, |
| "completions/min_terminated_length": 9173.0, |
| "entropy": 0.2647007992491126, |
| "epoch": 1.2926829268292683, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34376952052116394, |
| "learning_rate": 3.6799999999999996e-07, |
| "loss": 0.0327, |
| "num_tokens": 53833773.0, |
| "reward": 0.8059707880020142, |
| "reward_std": 0.7507649660110474, |
| "rewards/reward_func/mean": 0.8059707880020142, |
| "rewards/reward_func/std": 0.7507650256156921, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.3729827404022217, |
| "sampling/importance_sampling_ratio/mean": 0.9908524751663208, |
| "sampling/importance_sampling_ratio/min": 5.715029374186997e-07, |
| "sampling/sampling_logp_difference/max": 14.374996185302734, |
| "sampling/sampling_logp_difference/mean": 0.0191708542406559, |
| "step": 159, |
| "step_time": 379.8988157734275 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 10870.0, |
| "completions/max_terminated_length": 10870.0, |
| "completions/mean_length": 8080.4375, |
| "completions/mean_terminated_length": 8080.4375, |
| "completions/min_length": 5016.0, |
| "completions/min_terminated_length": 5016.0, |
| "entropy": 0.30127510614693165, |
| "epoch": 1.3008130081300813, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4035295248031616, |
| "learning_rate": 3.64e-07, |
| "loss": -0.0268, |
| "num_tokens": 54104107.0, |
| "reward": 1.0803825855255127, |
| "reward_std": 0.462587833404541, |
| "rewards/reward_func/mean": 1.0803825855255127, |
| "rewards/reward_func/std": 0.4625878632068634, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0112428665161133, |
| "sampling/importance_sampling_ratio/mean": 0.9894917011260986, |
| "sampling/importance_sampling_ratio/min": 0.26328322291374207, |
| "sampling/sampling_logp_difference/max": 1.3345248699188232, |
| "sampling/sampling_logp_difference/mean": 0.021139763295650482, |
| "step": 160, |
| "step_time": 253.5028428370133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 11090.0, |
| "completions/max_terminated_length": 11090.0, |
| "completions/mean_length": 5596.53125, |
| "completions/mean_terminated_length": 5596.53125, |
| "completions/min_length": 2069.0, |
| "completions/min_terminated_length": 2069.0, |
| "entropy": 0.33193016052246094, |
| "epoch": 1.3089430894308944, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.546795666217804, |
| "learning_rate": 3.6e-07, |
| "loss": -0.0351, |
| "num_tokens": 54291492.0, |
| "reward": 1.0095927715301514, |
| "reward_std": 0.5363887548446655, |
| "rewards/reward_func/mean": 1.0095927715301514, |
| "rewards/reward_func/std": 0.5363887548446655, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0163543224334717, |
| "sampling/importance_sampling_ratio/mean": 0.9889829158782959, |
| "sampling/importance_sampling_ratio/min": 0.24333754181861877, |
| "sampling/sampling_logp_difference/max": 1.4133057594299316, |
| "sampling/sampling_logp_difference/mean": 0.022219812497496605, |
| "step": 161, |
| "step_time": 240.68693689699285 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14764.0, |
| "completions/max_terminated_length": 14764.0, |
| "completions/mean_length": 10763.0625, |
| "completions/mean_terminated_length": 10763.0625, |
| "completions/min_length": 7169.0, |
| "completions/min_terminated_length": 7169.0, |
| "entropy": 0.28575049340724945, |
| "epoch": 1.3170731707317074, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.330822229385376, |
| "learning_rate": 3.5599999999999996e-07, |
| "loss": 0.0071, |
| "num_tokens": 54660694.0, |
| "reward": 0.9929087162017822, |
| "reward_std": 0.6908224821090698, |
| "rewards/reward_func/mean": 0.9929087162017822, |
| "rewards/reward_func/std": 0.690822422504425, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9898582100868225, |
| "sampling/importance_sampling_ratio/min": 0.14805620908737183, |
| "sampling/sampling_logp_difference/max": 1.9101632833480835, |
| "sampling/sampling_logp_difference/mean": 0.020126353949308395, |
| "step": 162, |
| "step_time": 551.6217741372529 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15333.0, |
| "completions/max_terminated_length": 15333.0, |
| "completions/mean_length": 11018.78125, |
| "completions/mean_terminated_length": 11018.78125, |
| "completions/min_length": 8906.0, |
| "completions/min_terminated_length": 8906.0, |
| "entropy": 0.2753280848264694, |
| "epoch": 1.3252032520325203, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3719634711742401, |
| "learning_rate": 3.52e-07, |
| "loss": 0.0282, |
| "num_tokens": 55038287.0, |
| "reward": 1.00343656539917, |
| "reward_std": 0.800672173500061, |
| "rewards/reward_func/mean": 1.00343656539917, |
| "rewards/reward_func/std": 0.800672173500061, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.007370710372925, |
| "sampling/importance_sampling_ratio/mean": 0.9903607964515686, |
| "sampling/importance_sampling_ratio/min": 0.00020890739688184112, |
| "sampling/sampling_logp_difference/max": 8.47361946105957, |
| "sampling/sampling_logp_difference/mean": 0.019656600430607796, |
| "step": 163, |
| "step_time": 544.9398823149968 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15932.0, |
| "completions/max_terminated_length": 15932.0, |
| "completions/mean_length": 10442.1875, |
| "completions/mean_terminated_length": 10442.1875, |
| "completions/min_length": 6051.0, |
| "completions/min_terminated_length": 6051.0, |
| "entropy": 0.30813820846378803, |
| "epoch": 1.3333333333333333, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3646389245986938, |
| "learning_rate": 3.4799999999999994e-07, |
| "loss": 0.025, |
| "num_tokens": 55389725.0, |
| "reward": 1.1668248176574707, |
| "reward_std": 0.6954712867736816, |
| "rewards/reward_func/mean": 1.1668248176574707, |
| "rewards/reward_func/std": 0.6954712867736816, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.9248101711273193, |
| "sampling/importance_sampling_ratio/mean": 0.9890936613082886, |
| "sampling/importance_sampling_ratio/min": 0.008675693534314632, |
| "sampling/sampling_logp_difference/max": 4.747230052947998, |
| "sampling/sampling_logp_difference/mean": 0.021686844527721405, |
| "step": 164, |
| "step_time": 348.8858406627551 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14837.0, |
| "completions/max_terminated_length": 14837.0, |
| "completions/mean_length": 8969.65625, |
| "completions/mean_terminated_length": 8969.65625, |
| "completions/min_length": 5282.0, |
| "completions/min_terminated_length": 5282.0, |
| "entropy": 0.2883797576650977, |
| "epoch": 1.3414634146341464, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4131164252758026, |
| "learning_rate": 3.4399999999999996e-07, |
| "loss": 0.0886, |
| "num_tokens": 55692242.0, |
| "reward": 1.4298856258392334, |
| "reward_std": 1.167038917541504, |
| "rewards/reward_func/mean": 1.4298856258392334, |
| "rewards/reward_func/std": 1.167038917541504, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2284605503082275, |
| "sampling/importance_sampling_ratio/mean": 0.9899915456771851, |
| "sampling/importance_sampling_ratio/min": 0.36445239186286926, |
| "sampling/sampling_logp_difference/max": 1.009359359741211, |
| "sampling/sampling_logp_difference/mean": 0.019986702129244804, |
| "step": 165, |
| "step_time": 543.1110370436218 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14801.0, |
| "completions/max_terminated_length": 14801.0, |
| "completions/mean_length": 9674.0625, |
| "completions/mean_terminated_length": 9674.0625, |
| "completions/min_length": 5760.0, |
| "completions/min_terminated_length": 5760.0, |
| "entropy": 0.2983303349465132, |
| "epoch": 1.3495934959349594, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 5.761723518371582, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": 0.022, |
| "num_tokens": 56013684.0, |
| "reward": 1.345805287361145, |
| "reward_std": 0.7664235830307007, |
| "rewards/reward_func/mean": 1.345805287361145, |
| "rewards/reward_func/std": 0.7664235830307007, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5187416076660156, |
| "sampling/importance_sampling_ratio/mean": 0.9897916316986084, |
| "sampling/importance_sampling_ratio/min": 0.25386354327201843, |
| "sampling/sampling_logp_difference/max": 1.3709583282470703, |
| "sampling/sampling_logp_difference/mean": 0.02112661674618721, |
| "step": 166, |
| "step_time": 493.43985287938267 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15028.0, |
| "completions/max_terminated_length": 15028.0, |
| "completions/mean_length": 9629.9375, |
| "completions/mean_terminated_length": 9629.9375, |
| "completions/min_length": 3882.0, |
| "completions/min_terminated_length": 3882.0, |
| "entropy": 0.26501744147390127, |
| "epoch": 1.3577235772357723, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4020431339740753, |
| "learning_rate": 3.36e-07, |
| "loss": 0.0422, |
| "num_tokens": 56355258.0, |
| "reward": 1.1285045146942139, |
| "reward_std": 0.537778377532959, |
| "rewards/reward_func/mean": 1.1285045146942139, |
| "rewards/reward_func/std": 0.537778377532959, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9907045364379883, |
| "sampling/importance_sampling_ratio/min": 0.003646957455202937, |
| "sampling/sampling_logp_difference/max": 5.613862037658691, |
| "sampling/sampling_logp_difference/mean": 0.01883198693394661, |
| "step": 167, |
| "step_time": 346.18117096996866 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14349.0, |
| "completions/max_terminated_length": 14349.0, |
| "completions/mean_length": 8990.5625, |
| "completions/mean_terminated_length": 8990.5625, |
| "completions/min_length": 5768.0, |
| "completions/min_terminated_length": 5768.0, |
| "entropy": 0.3040621541440487, |
| "epoch": 1.3658536585365852, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4271112084388733, |
| "learning_rate": 3.32e-07, |
| "loss": -0.0135, |
| "num_tokens": 56657932.0, |
| "reward": 1.023315668106079, |
| "reward_std": 0.5972030162811279, |
| "rewards/reward_func/mean": 1.023315668106079, |
| "rewards/reward_func/std": 0.5972029566764832, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.104907512664795, |
| "sampling/importance_sampling_ratio/mean": 0.9894360303878784, |
| "sampling/importance_sampling_ratio/min": 0.331084668636322, |
| "sampling/sampling_logp_difference/max": 1.1053811311721802, |
| "sampling/sampling_logp_difference/mean": 0.021181510761380196, |
| "step": 168, |
| "step_time": 523.4633230310865 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13526.0, |
| "completions/max_terminated_length": 13526.0, |
| "completions/mean_length": 6832.15625, |
| "completions/mean_terminated_length": 6832.15625, |
| "completions/min_length": 2270.0, |
| "completions/min_terminated_length": 2270.0, |
| "entropy": 0.3238808959722519, |
| "epoch": 1.3739837398373984, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5131815671920776, |
| "learning_rate": 3.28e-07, |
| "loss": -0.0588, |
| "num_tokens": 56888113.0, |
| "reward": 1.2511417865753174, |
| "reward_std": 0.5019726157188416, |
| "rewards/reward_func/mean": 1.2511417865753174, |
| "rewards/reward_func/std": 0.5019726157188416, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.56583309173584, |
| "sampling/importance_sampling_ratio/mean": 0.9888361692428589, |
| "sampling/importance_sampling_ratio/min": 0.42743268609046936, |
| "sampling/sampling_logp_difference/max": 0.9422832727432251, |
| "sampling/sampling_logp_difference/mean": 0.022333543747663498, |
| "step": 169, |
| "step_time": 282.8712672749534 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17421.0, |
| "completions/max_terminated_length": 17421.0, |
| "completions/mean_length": 10190.46875, |
| "completions/mean_terminated_length": 10190.46875, |
| "completions/min_length": 6579.0, |
| "completions/min_terminated_length": 6579.0, |
| "entropy": 0.305131521075964, |
| "epoch": 1.3821138211382114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3988380432128906, |
| "learning_rate": 3.24e-07, |
| "loss": 0.0232, |
| "num_tokens": 57231496.0, |
| "reward": 1.644756555557251, |
| "reward_std": 1.3392294645309448, |
| "rewards/reward_func/mean": 1.6526682376861572, |
| "rewards/reward_func/std": 1.3286665678024292, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00791168212890625, |
| "rewards/soft_overlong_punishment_reward/std": 0.0447552315890789, |
| "sampling/importance_sampling_ratio/max": 2.1310908794403076, |
| "sampling/importance_sampling_ratio/mean": 0.989220142364502, |
| "sampling/importance_sampling_ratio/min": 0.04181424155831337, |
| "sampling/sampling_logp_difference/max": 3.174518346786499, |
| "sampling/sampling_logp_difference/mean": 0.021304449066519737, |
| "step": 170, |
| "step_time": 494.5488368184306 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13615.0, |
| "completions/max_terminated_length": 13615.0, |
| "completions/mean_length": 8635.6875, |
| "completions/mean_terminated_length": 8635.6875, |
| "completions/min_length": 3753.0, |
| "completions/min_terminated_length": 3753.0, |
| "entropy": 0.315045366063714, |
| "epoch": 1.3902439024390243, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4601956605911255, |
| "learning_rate": 3.2e-07, |
| "loss": 0.009, |
| "num_tokens": 57518430.0, |
| "reward": 1.3443162441253662, |
| "reward_std": 0.39488282799720764, |
| "rewards/reward_func/mean": 1.3443162441253662, |
| "rewards/reward_func/std": 0.39488279819488525, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.062009572982788, |
| "sampling/importance_sampling_ratio/mean": 0.9892582893371582, |
| "sampling/importance_sampling_ratio/min": 0.43613678216934204, |
| "sampling/sampling_logp_difference/max": 0.8297994136810303, |
| "sampling/sampling_logp_difference/mean": 0.021855181083083153, |
| "step": 171, |
| "step_time": 453.36320838704705 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13453.0, |
| "completions/max_terminated_length": 13453.0, |
| "completions/mean_length": 8726.84375, |
| "completions/mean_terminated_length": 8726.84375, |
| "completions/min_length": 6157.0, |
| "completions/min_terminated_length": 6157.0, |
| "entropy": 0.3162997905164957, |
| "epoch": 1.3983739837398375, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4318312704563141, |
| "learning_rate": 3.1599999999999997e-07, |
| "loss": 0.0394, |
| "num_tokens": 57809209.0, |
| "reward": 1.3840892314910889, |
| "reward_std": 0.6176471710205078, |
| "rewards/reward_func/mean": 1.3840892314910889, |
| "rewards/reward_func/std": 0.6176471710205078, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5114269256591797, |
| "sampling/importance_sampling_ratio/mean": 0.9889503717422485, |
| "sampling/importance_sampling_ratio/min": 0.022857630625367165, |
| "sampling/sampling_logp_difference/max": 3.778470277786255, |
| "sampling/sampling_logp_difference/mean": 0.02213275618851185, |
| "step": 172, |
| "step_time": 301.59747786587104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13227.0, |
| "completions/max_terminated_length": 13227.0, |
| "completions/mean_length": 8122.09375, |
| "completions/mean_terminated_length": 8122.09375, |
| "completions/min_length": 4053.0, |
| "completions/min_terminated_length": 4053.0, |
| "entropy": 0.2920794412493706, |
| "epoch": 1.4065040650406504, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.44486507773399353, |
| "learning_rate": 3.12e-07, |
| "loss": 0.0322, |
| "num_tokens": 58092372.0, |
| "reward": 1.4264354705810547, |
| "reward_std": 0.8911484479904175, |
| "rewards/reward_func/mean": 1.4264354705810547, |
| "rewards/reward_func/std": 0.8911485075950623, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9848713874816895, |
| "sampling/importance_sampling_ratio/mean": 0.9898254871368408, |
| "sampling/importance_sampling_ratio/min": 0.0660436749458313, |
| "sampling/sampling_logp_difference/max": 2.7174389362335205, |
| "sampling/sampling_logp_difference/mean": 0.02044767141342163, |
| "step": 173, |
| "step_time": 399.5025182967074 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15866.0, |
| "completions/max_terminated_length": 15866.0, |
| "completions/mean_length": 8942.84375, |
| "completions/mean_terminated_length": 8942.84375, |
| "completions/min_length": 5281.0, |
| "completions/min_terminated_length": 5281.0, |
| "entropy": 0.28330389875918627, |
| "epoch": 1.4146341463414633, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.43173524737358093, |
| "learning_rate": 3.08e-07, |
| "loss": 0.0439, |
| "num_tokens": 58399383.0, |
| "reward": 1.0660076141357422, |
| "reward_std": 0.5465989112854004, |
| "rewards/reward_func/mean": 1.0660076141357422, |
| "rewards/reward_func/std": 0.5465989708900452, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1449825763702393, |
| "sampling/importance_sampling_ratio/mean": 0.9901489019393921, |
| "sampling/importance_sampling_ratio/min": 0.2062222957611084, |
| "sampling/sampling_logp_difference/max": 1.5788005590438843, |
| "sampling/sampling_logp_difference/mean": 0.01982881873846054, |
| "step": 174, |
| "step_time": 378.2133640507236 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16162.0, |
| "completions/max_terminated_length": 16162.0, |
| "completions/mean_length": 7844.625, |
| "completions/mean_terminated_length": 7844.625, |
| "completions/min_length": 2988.0, |
| "completions/min_terminated_length": 2988.0, |
| "entropy": 0.31094569712877274, |
| "epoch": 1.4227642276422765, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4638010263442993, |
| "learning_rate": 3.0399999999999997e-07, |
| "loss": -0.0176, |
| "num_tokens": 58662163.0, |
| "reward": 1.745612382888794, |
| "reward_std": 1.4541947841644287, |
| "rewards/reward_func/mean": 1.745612382888794, |
| "rewards/reward_func/std": 1.4541946649551392, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.7884289026260376, |
| "sampling/importance_sampling_ratio/mean": 0.9891934990882874, |
| "sampling/importance_sampling_ratio/min": 0.24390622973442078, |
| "sampling/sampling_logp_difference/max": 1.4109714031219482, |
| "sampling/sampling_logp_difference/mean": 0.021349426358938217, |
| "step": 175, |
| "step_time": 534.4618561277166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12543.0, |
| "completions/max_terminated_length": 12543.0, |
| "completions/mean_length": 8796.625, |
| "completions/mean_terminated_length": 8796.625, |
| "completions/min_length": 2902.0, |
| "completions/min_terminated_length": 2902.0, |
| "entropy": 0.2846886198967695, |
| "epoch": 1.4308943089430894, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3884636461734772, |
| "learning_rate": 3e-07, |
| "loss": -0.0042, |
| "num_tokens": 58965455.0, |
| "reward": 1.2353037595748901, |
| "reward_std": 0.3446652889251709, |
| "rewards/reward_func/mean": 1.2353037595748901, |
| "rewards/reward_func/std": 0.3446652591228485, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.538331985473633, |
| "sampling/importance_sampling_ratio/mean": 0.9902387857437134, |
| "sampling/importance_sampling_ratio/min": 0.05407044291496277, |
| "sampling/sampling_logp_difference/max": 2.9174675941467285, |
| "sampling/sampling_logp_difference/mean": 0.019742220640182495, |
| "step": 176, |
| "step_time": 344.33025532308966 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15643.0, |
| "completions/max_terminated_length": 15643.0, |
| "completions/mean_length": 10607.03125, |
| "completions/mean_terminated_length": 10607.03125, |
| "completions/min_length": 7345.0, |
| "completions/min_terminated_length": 7345.0, |
| "entropy": 0.3004197124391794, |
| "epoch": 1.4390243902439024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3924950659275055, |
| "learning_rate": 2.9599999999999995e-07, |
| "loss": 0.007, |
| "num_tokens": 59328144.0, |
| "reward": 0.827019214630127, |
| "reward_std": 0.663148045539856, |
| "rewards/reward_func/mean": 0.827019214630127, |
| "rewards/reward_func/std": 0.663148045539856, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9895559549331665, |
| "sampling/importance_sampling_ratio/min": 0.00959984865039587, |
| "sampling/sampling_logp_difference/max": 4.646008014678955, |
| "sampling/sampling_logp_difference/mean": 0.021044539287686348, |
| "step": 177, |
| "step_time": 421.6234840967227 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13202.0, |
| "completions/max_terminated_length": 13202.0, |
| "completions/mean_length": 8350.4375, |
| "completions/mean_terminated_length": 8350.4375, |
| "completions/min_length": 6289.0, |
| "completions/min_terminated_length": 6289.0, |
| "entropy": 0.3156953826546669, |
| "epoch": 1.4471544715447155, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4544045627117157, |
| "learning_rate": 2.9199999999999997e-07, |
| "loss": 0.0208, |
| "num_tokens": 59610238.0, |
| "reward": 0.9881260395050049, |
| "reward_std": 0.4920128583908081, |
| "rewards/reward_func/mean": 0.9881260395050049, |
| "rewards/reward_func/std": 0.4920128285884857, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.523590087890625, |
| "sampling/importance_sampling_ratio/mean": 0.9888817071914673, |
| "sampling/importance_sampling_ratio/min": 0.265560507774353, |
| "sampling/sampling_logp_difference/max": 1.325912594795227, |
| "sampling/sampling_logp_difference/mean": 0.021911103278398514, |
| "step": 178, |
| "step_time": 292.23001111787744 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12649.0, |
| "completions/max_terminated_length": 12649.0, |
| "completions/mean_length": 10555.78125, |
| "completions/mean_terminated_length": 10555.78125, |
| "completions/min_length": 7665.0, |
| "completions/min_terminated_length": 7665.0, |
| "entropy": 0.28271804563701153, |
| "epoch": 1.4552845528455285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36825451254844666, |
| "learning_rate": 2.88e-07, |
| "loss": -0.0295, |
| "num_tokens": 59966983.0, |
| "reward": 1.053284764289856, |
| "reward_std": 0.481535404920578, |
| "rewards/reward_func/mean": 1.053284764289856, |
| "rewards/reward_func/std": 0.4815354347229004, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.7443697452545166, |
| "sampling/importance_sampling_ratio/mean": 0.9902329444885254, |
| "sampling/importance_sampling_ratio/min": 0.21681459248065948, |
| "sampling/sampling_logp_difference/max": 1.5287127494812012, |
| "sampling/sampling_logp_difference/mean": 0.019804177805781364, |
| "step": 179, |
| "step_time": 316.3304076856002 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 18300.0, |
| "completions/max_terminated_length": 18300.0, |
| "completions/mean_length": 9951.625, |
| "completions/mean_terminated_length": 9951.625, |
| "completions/min_length": 2899.0, |
| "completions/min_terminated_length": 2899.0, |
| "entropy": 0.25757203437387943, |
| "epoch": 1.4634146341463414, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38143888115882874, |
| "learning_rate": 2.8399999999999995e-07, |
| "loss": 0.0341, |
| "num_tokens": 60302987.0, |
| "reward": 0.9845483303070068, |
| "reward_std": 0.8336848616600037, |
| "rewards/reward_func/mean": 0.9991662502288818, |
| "rewards/reward_func/std": 0.8121254444122314, |
| "rewards/soft_overlong_punishment_reward/mean": -0.014617919921875, |
| "rewards/soft_overlong_punishment_reward/std": 0.08269143849611282, |
| "sampling/importance_sampling_ratio/max": 2.355394124984741, |
| "sampling/importance_sampling_ratio/mean": 0.990979790687561, |
| "sampling/importance_sampling_ratio/min": 0.3311867415904999, |
| "sampling/sampling_logp_difference/max": 1.1050728559494019, |
| "sampling/sampling_logp_difference/mean": 0.018629489466547966, |
| "step": 180, |
| "step_time": 444.39584999810904 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13861.0, |
| "completions/max_terminated_length": 13861.0, |
| "completions/mean_length": 8278.9375, |
| "completions/mean_terminated_length": 8278.9375, |
| "completions/min_length": 5209.0, |
| "completions/min_terminated_length": 5209.0, |
| "entropy": 0.305585864931345, |
| "epoch": 1.4715447154471546, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.40094515681266785, |
| "learning_rate": 2.8e-07, |
| "loss": 0.0255, |
| "num_tokens": 60582913.0, |
| "reward": 1.5346317291259766, |
| "reward_std": 1.287915825843811, |
| "rewards/reward_func/mean": 1.5346317291259766, |
| "rewards/reward_func/std": 1.2879159450531006, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.060699701309204, |
| "sampling/importance_sampling_ratio/mean": 0.9892938137054443, |
| "sampling/importance_sampling_ratio/min": 0.18372268974781036, |
| "sampling/sampling_logp_difference/max": 1.6943278312683105, |
| "sampling/sampling_logp_difference/mean": 0.021300731226801872, |
| "step": 181, |
| "step_time": 307.09654345782474 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15362.0, |
| "completions/max_terminated_length": 15362.0, |
| "completions/mean_length": 9544.125, |
| "completions/mean_terminated_length": 9544.125, |
| "completions/min_length": 6910.0, |
| "completions/min_terminated_length": 6910.0, |
| "entropy": 0.29761974327266216, |
| "epoch": 1.4796747967479675, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4058556854724884, |
| "learning_rate": 2.7600000000000004e-07, |
| "loss": 0.0353, |
| "num_tokens": 60904933.0, |
| "reward": 1.4360902309417725, |
| "reward_std": 1.2418161630630493, |
| "rewards/reward_func/mean": 1.4360902309417725, |
| "rewards/reward_func/std": 1.2418162822723389, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.379150629043579, |
| "sampling/importance_sampling_ratio/mean": 0.9895496368408203, |
| "sampling/importance_sampling_ratio/min": 0.0527670793235302, |
| "sampling/sampling_logp_difference/max": 2.9418678283691406, |
| "sampling/sampling_logp_difference/mean": 0.020994048565626144, |
| "step": 182, |
| "step_time": 385.816073252121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14594.0, |
| "completions/max_terminated_length": 14594.0, |
| "completions/mean_length": 10289.25, |
| "completions/mean_terminated_length": 10289.25, |
| "completions/min_length": 7779.0, |
| "completions/min_terminated_length": 7779.0, |
| "entropy": 0.27037277817726135, |
| "epoch": 1.4878048780487805, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37331607937812805, |
| "learning_rate": 2.72e-07, |
| "loss": 0.0217, |
| "num_tokens": 61268189.0, |
| "reward": 0.7746168375015259, |
| "reward_std": 0.6733230948448181, |
| "rewards/reward_func/mean": 0.7746168375015259, |
| "rewards/reward_func/std": 0.6733230948448181, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1012680530548096, |
| "sampling/importance_sampling_ratio/mean": 0.990485668182373, |
| "sampling/importance_sampling_ratio/min": 0.30445200204849243, |
| "sampling/sampling_logp_difference/max": 1.189241886138916, |
| "sampling/sampling_logp_difference/mean": 0.01935386285185814, |
| "step": 183, |
| "step_time": 546.588172652293 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13994.0, |
| "completions/max_terminated_length": 13994.0, |
| "completions/mean_length": 9043.59375, |
| "completions/mean_terminated_length": 9043.59375, |
| "completions/min_length": 6681.0, |
| "completions/min_terminated_length": 6681.0, |
| "entropy": 0.29701266437768936, |
| "epoch": 1.4959349593495934, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41282883286476135, |
| "learning_rate": 2.68e-07, |
| "loss": -0.0133, |
| "num_tokens": 61573856.0, |
| "reward": 1.3877496719360352, |
| "reward_std": 0.6844647526741028, |
| "rewards/reward_func/mean": 1.3877496719360352, |
| "rewards/reward_func/std": 0.6844647526741028, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.7882211208343506, |
| "sampling/importance_sampling_ratio/mean": 0.9894493818283081, |
| "sampling/importance_sampling_ratio/min": 0.3472015857696533, |
| "sampling/sampling_logp_difference/max": 1.057849645614624, |
| "sampling/sampling_logp_difference/mean": 0.02082856558263302, |
| "step": 184, |
| "step_time": 348.1651652737055 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13691.0, |
| "completions/max_terminated_length": 13691.0, |
| "completions/mean_length": 8973.8125, |
| "completions/mean_terminated_length": 8973.8125, |
| "completions/min_length": 5585.0, |
| "completions/min_terminated_length": 5585.0, |
| "entropy": 0.28570630215108395, |
| "epoch": 1.5040650406504064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4094651937484741, |
| "learning_rate": 2.64e-07, |
| "loss": 0.0304, |
| "num_tokens": 61875338.0, |
| "reward": 1.4425820112228394, |
| "reward_std": 1.3329182863235474, |
| "rewards/reward_func/mean": 1.4425820112228394, |
| "rewards/reward_func/std": 1.3329182863235474, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9900859594345093, |
| "sampling/importance_sampling_ratio/min": 0.11671532690525055, |
| "sampling/sampling_logp_difference/max": 2.148017406463623, |
| "sampling/sampling_logp_difference/mean": 0.01998751237988472, |
| "step": 185, |
| "step_time": 517.4829101127107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15249.0, |
| "completions/max_terminated_length": 15249.0, |
| "completions/mean_length": 8211.6875, |
| "completions/mean_terminated_length": 8211.6875, |
| "completions/min_length": 4763.0, |
| "completions/min_terminated_length": 4763.0, |
| "entropy": 0.3177320044487715, |
| "epoch": 1.5121951219512195, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.9853414297103882, |
| "learning_rate": 2.6e-07, |
| "loss": -0.0002, |
| "num_tokens": 62148608.0, |
| "reward": 1.4485201835632324, |
| "reward_std": 0.19182679057121277, |
| "rewards/reward_func/mean": 1.4485201835632324, |
| "rewards/reward_func/std": 0.1918267458677292, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.7517563104629517, |
| "sampling/importance_sampling_ratio/mean": 0.9886523485183716, |
| "sampling/importance_sampling_ratio/min": 0.43156927824020386, |
| "sampling/sampling_logp_difference/max": 0.840327262878418, |
| "sampling/sampling_logp_difference/mean": 0.02235923707485199, |
| "step": 186, |
| "step_time": 319.5429007699713 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16617.0, |
| "completions/max_terminated_length": 16617.0, |
| "completions/mean_length": 8996.59375, |
| "completions/mean_terminated_length": 8996.59375, |
| "completions/min_length": 3658.0, |
| "completions/min_terminated_length": 3658.0, |
| "entropy": 0.2926332140341401, |
| "epoch": 1.5203252032520327, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40174105763435364, |
| "learning_rate": 2.56e-07, |
| "loss": 0.0098, |
| "num_tokens": 62451283.0, |
| "reward": 1.507631778717041, |
| "reward_std": 1.6578742265701294, |
| "rewards/reward_func/mean": 1.5094094276428223, |
| "rewards/reward_func/std": 1.6561721563339233, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00177764892578125, |
| "rewards/soft_overlong_punishment_reward/std": 0.010055900551378727, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902303814888, |
| "sampling/importance_sampling_ratio/min": 0.30990007519721985, |
| "sampling/sampling_logp_difference/max": 1.633288860321045, |
| "sampling/sampling_logp_difference/mean": 0.020211469382047653, |
| "step": 187, |
| "step_time": 336.45058575086296 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16478.0, |
| "completions/max_terminated_length": 16478.0, |
| "completions/mean_length": 10577.6875, |
| "completions/mean_terminated_length": 10577.6875, |
| "completions/min_length": 7560.0, |
| "completions/min_terminated_length": 7560.0, |
| "entropy": 0.2819963004440069, |
| "epoch": 1.5284552845528454, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37996959686279297, |
| "learning_rate": 2.52e-07, |
| "loss": 0.0144, |
| "num_tokens": 62817409.0, |
| "reward": 0.5808703303337097, |
| "reward_std": 0.8241087198257446, |
| "rewards/reward_func/mean": 0.5815874934196472, |
| "rewards/reward_func/std": 0.8235760927200317, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0007171630859375, |
| "rewards/soft_overlong_punishment_reward/std": 0.004056887235492468, |
| "sampling/importance_sampling_ratio/max": 2.636267900466919, |
| "sampling/importance_sampling_ratio/mean": 0.9898158311843872, |
| "sampling/importance_sampling_ratio/min": 0.0027106255292892456, |
| "sampling/sampling_logp_difference/max": 5.910575866699219, |
| "sampling/sampling_logp_difference/mean": 0.02044173702597618, |
| "step": 188, |
| "step_time": 499.72255454421975 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14677.0, |
| "completions/max_terminated_length": 14677.0, |
| "completions/mean_length": 9978.09375, |
| "completions/mean_terminated_length": 9978.09375, |
| "completions/min_length": 4882.0, |
| "completions/min_terminated_length": 4882.0, |
| "entropy": 0.2741777431219816, |
| "epoch": 1.5365853658536586, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3784760534763336, |
| "learning_rate": 2.48e-07, |
| "loss": 0.013, |
| "num_tokens": 63159460.0, |
| "reward": 1.0896389484405518, |
| "reward_std": 0.4881446063518524, |
| "rewards/reward_func/mean": 1.0896389484405518, |
| "rewards/reward_func/std": 0.4881446361541748, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9906512498855591, |
| "sampling/importance_sampling_ratio/min": 0.29679495096206665, |
| "sampling/sampling_logp_difference/max": 1.8686549663543701, |
| "sampling/sampling_logp_difference/mean": 0.019687175750732422, |
| "step": 189, |
| "step_time": 319.22378592635505 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15448.0, |
| "completions/max_terminated_length": 15448.0, |
| "completions/mean_length": 10634.625, |
| "completions/mean_terminated_length": 10634.625, |
| "completions/min_length": 6856.0, |
| "completions/min_terminated_length": 6856.0, |
| "entropy": 0.27850321400910616, |
| "epoch": 1.5447154471544715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3891684114933014, |
| "learning_rate": 2.4399999999999996e-07, |
| "loss": 0.0268, |
| "num_tokens": 63523312.0, |
| "reward": 0.9600388407707214, |
| "reward_std": 0.6232390403747559, |
| "rewards/reward_func/mean": 0.9600388407707214, |
| "rewards/reward_func/std": 0.6232389807701111, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9903505444526672, |
| "sampling/importance_sampling_ratio/min": 0.1122722402215004, |
| "sampling/sampling_logp_difference/max": 2.18682861328125, |
| "sampling/sampling_logp_difference/mean": 0.0196834784001112, |
| "step": 190, |
| "step_time": 358.385191940004 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12682.0, |
| "completions/max_terminated_length": 12682.0, |
| "completions/mean_length": 7577.3125, |
| "completions/mean_terminated_length": 7577.3125, |
| "completions/min_length": 1562.0, |
| "completions/min_terminated_length": 1562.0, |
| "entropy": 0.31117005459964275, |
| "epoch": 1.5528455284552845, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4534858167171478, |
| "learning_rate": 2.4e-07, |
| "loss": -0.0218, |
| "num_tokens": 63777218.0, |
| "reward": 1.481154203414917, |
| "reward_std": 0.709784984588623, |
| "rewards/reward_func/mean": 1.481154203414917, |
| "rewards/reward_func/std": 0.7097850441932678, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9891089797019958, |
| "sampling/importance_sampling_ratio/min": 0.2799838185310364, |
| "sampling/sampling_logp_difference/max": 1.3334325551986694, |
| "sampling/sampling_logp_difference/mean": 0.021504439413547516, |
| "step": 191, |
| "step_time": 333.1857100597117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12243.0, |
| "completions/max_terminated_length": 12243.0, |
| "completions/mean_length": 9093.09375, |
| "completions/mean_terminated_length": 9093.09375, |
| "completions/min_length": 5665.0, |
| "completions/min_terminated_length": 5665.0, |
| "entropy": 0.284919373691082, |
| "epoch": 1.5609756097560976, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40635332465171814, |
| "learning_rate": 2.3599999999999997e-07, |
| "loss": -0.0044, |
| "num_tokens": 64081693.0, |
| "reward": 1.447981834411621, |
| "reward_std": 1.2502341270446777, |
| "rewards/reward_func/mean": 1.447981834411621, |
| "rewards/reward_func/std": 1.2502341270446777, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.4054501056671143, |
| "sampling/importance_sampling_ratio/mean": 0.9902962446212769, |
| "sampling/importance_sampling_ratio/min": 0.06248319521546364, |
| "sampling/sampling_logp_difference/max": 2.772857666015625, |
| "sampling/sampling_logp_difference/mean": 0.01986866444349289, |
| "step": 192, |
| "step_time": 292.0563309621066 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15147.0, |
| "completions/max_terminated_length": 15147.0, |
| "completions/mean_length": 10851.03125, |
| "completions/mean_terminated_length": 10851.03125, |
| "completions/min_length": 7781.0, |
| "completions/min_terminated_length": 7781.0, |
| "entropy": 0.29965684935450554, |
| "epoch": 1.5691056910569106, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3997621536254883, |
| "learning_rate": 2.32e-07, |
| "loss": -0.0378, |
| "num_tokens": 64449766.0, |
| "reward": 0.8266834020614624, |
| "reward_std": 0.6421800851821899, |
| "rewards/reward_func/mean": 0.8266834020614624, |
| "rewards/reward_func/std": 0.6421800255775452, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9896982908248901, |
| "sampling/importance_sampling_ratio/min": 0.00025375522091053426, |
| "sampling/sampling_logp_difference/max": 8.27914047241211, |
| "sampling/sampling_logp_difference/mean": 0.021142274141311646, |
| "step": 193, |
| "step_time": 365.25503698992543 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15063.0, |
| "completions/max_terminated_length": 15063.0, |
| "completions/mean_length": 10946.09375, |
| "completions/mean_terminated_length": 10946.09375, |
| "completions/min_length": 6879.0, |
| "completions/min_terminated_length": 6879.0, |
| "entropy": 0.31737131997942924, |
| "epoch": 1.5772357723577235, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.8579146265983582, |
| "learning_rate": 2.28e-07, |
| "loss": 0.0609, |
| "num_tokens": 64812785.0, |
| "reward": 1.1919212341308594, |
| "reward_std": 0.6313185691833496, |
| "rewards/reward_func/mean": 1.1919212341308594, |
| "rewards/reward_func/std": 0.6313185691833496, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2314682006835938, |
| "sampling/importance_sampling_ratio/mean": 0.989063024520874, |
| "sampling/importance_sampling_ratio/min": 0.3655092716217041, |
| "sampling/sampling_logp_difference/max": 1.006463646888733, |
| "sampling/sampling_logp_difference/mean": 0.02223176136612892, |
| "step": 194, |
| "step_time": 351.5693916489836 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14915.0, |
| "completions/max_terminated_length": 14915.0, |
| "completions/mean_length": 10867.5625, |
| "completions/mean_terminated_length": 10867.5625, |
| "completions/min_length": 5934.0, |
| "completions/min_terminated_length": 5934.0, |
| "entropy": 0.2743126470595598, |
| "epoch": 1.5853658536585367, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36443397402763367, |
| "learning_rate": 2.24e-07, |
| "loss": 0.0046, |
| "num_tokens": 65179611.0, |
| "reward": 1.07879638671875, |
| "reward_std": 0.6518019437789917, |
| "rewards/reward_func/mean": 1.07879638671875, |
| "rewards/reward_func/std": 0.6518019437789917, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1472198963165283, |
| "sampling/importance_sampling_ratio/mean": 0.9904861450195312, |
| "sampling/importance_sampling_ratio/min": 0.10317613184452057, |
| "sampling/sampling_logp_difference/max": 2.271317720413208, |
| "sampling/sampling_logp_difference/mean": 0.01932957023382187, |
| "step": 195, |
| "step_time": 342.88379572634585 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14422.0, |
| "completions/max_terminated_length": 14422.0, |
| "completions/mean_length": 9513.5625, |
| "completions/mean_terminated_length": 9513.5625, |
| "completions/min_length": 3494.0, |
| "completions/min_terminated_length": 3494.0, |
| "entropy": 0.28421447053551674, |
| "epoch": 1.5934959349593496, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40306708216667175, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0735, |
| "num_tokens": 65502101.0, |
| "reward": 1.6892921924591064, |
| "reward_std": 1.7240147590637207, |
| "rewards/reward_func/mean": 1.6892921924591064, |
| "rewards/reward_func/std": 1.7240146398544312, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9903287887573242, |
| "sampling/importance_sampling_ratio/min": 0.06248319521546364, |
| "sampling/sampling_logp_difference/max": 2.772857666015625, |
| "sampling/sampling_logp_difference/mean": 0.019995655864477158, |
| "step": 196, |
| "step_time": 333.16898891096935 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15583.0, |
| "completions/max_terminated_length": 15583.0, |
| "completions/mean_length": 11488.0625, |
| "completions/mean_terminated_length": 11488.0625, |
| "completions/min_length": 8134.0, |
| "completions/min_terminated_length": 8134.0, |
| "entropy": 0.27682713977992535, |
| "epoch": 1.6016260162601625, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3545663356781006, |
| "learning_rate": 2.1599999999999998e-07, |
| "loss": 0.0278, |
| "num_tokens": 65893255.0, |
| "reward": 1.7090435028076172, |
| "reward_std": 1.8992475271224976, |
| "rewards/reward_func/mean": 1.7090435028076172, |
| "rewards/reward_func/std": 1.899247646331787, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902373552322388, |
| "sampling/importance_sampling_ratio/min": 0.2602575421333313, |
| "sampling/sampling_logp_difference/max": 1.346083641052246, |
| "sampling/sampling_logp_difference/mean": 0.01981320045888424, |
| "step": 197, |
| "step_time": 370.34518124256283 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12766.0, |
| "completions/max_terminated_length": 12766.0, |
| "completions/mean_length": 9726.1875, |
| "completions/mean_terminated_length": 9726.1875, |
| "completions/min_length": 4384.0, |
| "completions/min_terminated_length": 4384.0, |
| "entropy": 0.2942858459427953, |
| "epoch": 1.6097560975609757, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.34428897500038147, |
| "learning_rate": 2.12e-07, |
| "loss": 0.0091, |
| "num_tokens": 66231221.0, |
| "reward": 0.8734248280525208, |
| "reward_std": 0.6437625288963318, |
| "rewards/reward_func/mean": 0.8734248280525208, |
| "rewards/reward_func/std": 0.6437625288963318, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2832908630371094, |
| "sampling/importance_sampling_ratio/mean": 0.9896547794342041, |
| "sampling/importance_sampling_ratio/min": 0.2729717493057251, |
| "sampling/sampling_logp_difference/max": 1.298387050628662, |
| "sampling/sampling_logp_difference/mean": 0.020584698766469955, |
| "step": 198, |
| "step_time": 540.5545430611819 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12849.0, |
| "completions/max_terminated_length": 12849.0, |
| "completions/mean_length": 9709.1875, |
| "completions/mean_terminated_length": 9709.1875, |
| "completions/min_length": 6938.0, |
| "completions/min_terminated_length": 6938.0, |
| "entropy": 0.29280679672956467, |
| "epoch": 1.6178861788617886, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.40767934918403625, |
| "learning_rate": 2.0799999999999998e-07, |
| "loss": 0.0156, |
| "num_tokens": 66560883.0, |
| "reward": 0.6801372766494751, |
| "reward_std": 0.6403651237487793, |
| "rewards/reward_func/mean": 0.6801372766494751, |
| "rewards/reward_func/std": 0.6403650641441345, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.7840784788131714, |
| "sampling/importance_sampling_ratio/mean": 0.989808201789856, |
| "sampling/importance_sampling_ratio/min": 0.3189617097377777, |
| "sampling/sampling_logp_difference/max": 1.1426842212677002, |
| "sampling/sampling_logp_difference/mean": 0.020620396360754967, |
| "step": 199, |
| "step_time": 288.9684821246192 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15979.0, |
| "completions/max_terminated_length": 15979.0, |
| "completions/mean_length": 10304.9375, |
| "completions/mean_terminated_length": 10304.9375, |
| "completions/min_length": 3889.0, |
| "completions/min_terminated_length": 3889.0, |
| "entropy": 0.27815717831254005, |
| "epoch": 1.6260162601626016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38130983710289, |
| "learning_rate": 2.0399999999999997e-07, |
| "loss": 0.0202, |
| "num_tokens": 66909617.0, |
| "reward": 1.4418128728866577, |
| "reward_std": 1.1881235837936401, |
| "rewards/reward_func/mean": 1.4418128728866577, |
| "rewards/reward_func/std": 1.1881235837936401, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2451488971710205, |
| "sampling/importance_sampling_ratio/mean": 0.9904321432113647, |
| "sampling/importance_sampling_ratio/min": 0.3691475987434387, |
| "sampling/sampling_logp_difference/max": 0.9965587258338928, |
| "sampling/sampling_logp_difference/mean": 0.019482022151350975, |
| "step": 200, |
| "step_time": 518.1173423542641 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13914.0, |
| "completions/max_terminated_length": 13914.0, |
| "completions/mean_length": 9100.125, |
| "completions/mean_terminated_length": 9100.125, |
| "completions/min_length": 5304.0, |
| "completions/min_terminated_length": 5304.0, |
| "entropy": 0.27993758488446474, |
| "epoch": 1.6341463414634148, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.374416708946228, |
| "learning_rate": 2e-07, |
| "loss": 0.0004, |
| "num_tokens": 67219981.0, |
| "reward": 1.3626954555511475, |
| "reward_std": 0.2722192108631134, |
| "rewards/reward_func/mean": 1.3626954555511475, |
| "rewards/reward_func/std": 0.2722192406654358, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.924107074737549, |
| "sampling/importance_sampling_ratio/mean": 0.9901722073554993, |
| "sampling/importance_sampling_ratio/min": 0.24207624793052673, |
| "sampling/sampling_logp_difference/max": 1.4185025691986084, |
| "sampling/sampling_logp_difference/mean": 0.01998857781291008, |
| "step": 201, |
| "step_time": 314.7476293069776 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13731.0, |
| "completions/max_terminated_length": 13731.0, |
| "completions/mean_length": 9330.75, |
| "completions/mean_terminated_length": 9330.75, |
| "completions/min_length": 6053.0, |
| "completions/min_terminated_length": 6053.0, |
| "entropy": 0.2888748459517956, |
| "epoch": 1.6422764227642277, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3868502974510193, |
| "learning_rate": 1.96e-07, |
| "loss": 0.007, |
| "num_tokens": 67532997.0, |
| "reward": 1.3977718353271484, |
| "reward_std": 0.4272344708442688, |
| "rewards/reward_func/mean": 1.3977718353271484, |
| "rewards/reward_func/std": 0.4272345006465912, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.656132936477661, |
| "sampling/importance_sampling_ratio/mean": 0.9900428652763367, |
| "sampling/importance_sampling_ratio/min": 0.12062080949544907, |
| "sampling/sampling_logp_difference/max": 2.1151034832000732, |
| "sampling/sampling_logp_difference/mean": 0.020267413929104805, |
| "step": 202, |
| "step_time": 308.19105943036266 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14783.0, |
| "completions/max_terminated_length": 14783.0, |
| "completions/mean_length": 10748.21875, |
| "completions/mean_terminated_length": 10748.21875, |
| "completions/min_length": 6890.0, |
| "completions/min_terminated_length": 6890.0, |
| "entropy": 0.2793641071766615, |
| "epoch": 1.6504065040650406, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.5265854001045227, |
| "learning_rate": 1.92e-07, |
| "loss": -0.0092, |
| "num_tokens": 67896876.0, |
| "reward": 1.05764901638031, |
| "reward_std": 0.5356566309928894, |
| "rewards/reward_func/mean": 1.05764901638031, |
| "rewards/reward_func/std": 0.5356566309928894, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9288419485092163, |
| "sampling/importance_sampling_ratio/mean": 0.9903679490089417, |
| "sampling/importance_sampling_ratio/min": 0.373412162065506, |
| "sampling/sampling_logp_difference/max": 0.9850724935531616, |
| "sampling/sampling_logp_difference/mean": 0.019830968230962753, |
| "step": 203, |
| "step_time": 414.82067601103336 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16746.0, |
| "completions/max_terminated_length": 16746.0, |
| "completions/mean_length": 12129.65625, |
| "completions/mean_terminated_length": 12129.65625, |
| "completions/min_length": 7692.0, |
| "completions/min_terminated_length": 7692.0, |
| "entropy": 0.25799091067165136, |
| "epoch": 1.6585365853658538, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36031574010849, |
| "learning_rate": 1.88e-07, |
| "loss": 0.0602, |
| "num_tokens": 68311737.0, |
| "reward": 1.1385812759399414, |
| "reward_std": 0.9196106791496277, |
| "rewards/reward_func/mean": 1.1421899795532227, |
| "rewards/reward_func/std": 0.9190824627876282, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00360870361328125, |
| "rewards/soft_overlong_punishment_reward/std": 0.016192881390452385, |
| "sampling/importance_sampling_ratio/max": 2.586771011352539, |
| "sampling/importance_sampling_ratio/mean": 0.9909306168556213, |
| "sampling/importance_sampling_ratio/min": 0.07289119064807892, |
| "sampling/sampling_logp_difference/max": 2.6187875270843506, |
| "sampling/sampling_logp_difference/mean": 0.018782958388328552, |
| "step": 204, |
| "step_time": 588.7674697954208 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14103.0, |
| "completions/max_terminated_length": 14103.0, |
| "completions/mean_length": 9646.96875, |
| "completions/mean_terminated_length": 9646.96875, |
| "completions/min_length": 5721.0, |
| "completions/min_terminated_length": 5721.0, |
| "entropy": 0.2763592302799225, |
| "epoch": 1.6666666666666665, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3763327896595001, |
| "learning_rate": 1.8399999999999998e-07, |
| "loss": 0.0166, |
| "num_tokens": 68639720.0, |
| "reward": 1.1393167972564697, |
| "reward_std": 0.49497222900390625, |
| "rewards/reward_func/mean": 1.1393167972564697, |
| "rewards/reward_func/std": 0.49497222900390625, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.4610517024993896, |
| "sampling/importance_sampling_ratio/mean": 0.9903528094291687, |
| "sampling/importance_sampling_ratio/min": 0.13930465281009674, |
| "sampling/sampling_logp_difference/max": 1.9710919857025146, |
| "sampling/sampling_logp_difference/mean": 0.019617252051830292, |
| "step": 205, |
| "step_time": 339.363127422519 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13168.0, |
| "completions/max_terminated_length": 13168.0, |
| "completions/mean_length": 8321.0625, |
| "completions/mean_terminated_length": 8321.0625, |
| "completions/min_length": 4609.0, |
| "completions/min_terminated_length": 4609.0, |
| "entropy": 0.29074352979660034, |
| "epoch": 1.6747967479674797, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.46806299686431885, |
| "learning_rate": 1.8e-07, |
| "loss": -0.0141, |
| "num_tokens": 68923034.0, |
| "reward": 1.2282514572143555, |
| "reward_std": 0.945728063583374, |
| "rewards/reward_func/mean": 1.2282514572143555, |
| "rewards/reward_func/std": 0.945728063583374, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.9357402324676514, |
| "sampling/importance_sampling_ratio/mean": 0.9899877309799194, |
| "sampling/importance_sampling_ratio/min": 0.004849388264119625, |
| "sampling/sampling_logp_difference/max": 5.328902721405029, |
| "sampling/sampling_logp_difference/mean": 0.019961174577474594, |
| "step": 206, |
| "step_time": 335.27952317520976 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14498.0, |
| "completions/max_terminated_length": 14498.0, |
| "completions/mean_length": 9859.0, |
| "completions/mean_terminated_length": 9859.0, |
| "completions/min_length": 7080.0, |
| "completions/min_terminated_length": 7080.0, |
| "entropy": 0.2817118000239134, |
| "epoch": 1.6829268292682928, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41321122646331787, |
| "learning_rate": 1.76e-07, |
| "loss": -0.0156, |
| "num_tokens": 69258610.0, |
| "reward": 1.5615112781524658, |
| "reward_std": 1.2650940418243408, |
| "rewards/reward_func/mean": 1.5615112781524658, |
| "rewards/reward_func/std": 1.2650940418243408, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.48781681060791, |
| "sampling/importance_sampling_ratio/mean": 0.9902306795120239, |
| "sampling/importance_sampling_ratio/min": 0.09276595711708069, |
| "sampling/sampling_logp_difference/max": 2.3776755332946777, |
| "sampling/sampling_logp_difference/mean": 0.01979946345090866, |
| "step": 207, |
| "step_time": 412.5895248707384 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12872.0, |
| "completions/max_terminated_length": 12872.0, |
| "completions/mean_length": 8875.53125, |
| "completions/mean_terminated_length": 8875.53125, |
| "completions/min_length": 4162.0, |
| "completions/min_terminated_length": 4162.0, |
| "entropy": 0.28253381699323654, |
| "epoch": 1.6910569105691056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3998742699623108, |
| "learning_rate": 1.7199999999999998e-07, |
| "loss": -0.0407, |
| "num_tokens": 69556795.0, |
| "reward": 1.1288670301437378, |
| "reward_std": 0.5693311095237732, |
| "rewards/reward_func/mean": 1.1288670301437378, |
| "rewards/reward_func/std": 0.5693311095237732, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901490211486816, |
| "sampling/importance_sampling_ratio/min": 0.33106470108032227, |
| "sampling/sampling_logp_difference/max": 1.128915548324585, |
| "sampling/sampling_logp_difference/mean": 0.019885778427124023, |
| "step": 208, |
| "step_time": 287.43813130888157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14729.0, |
| "completions/max_terminated_length": 14729.0, |
| "completions/mean_length": 10587.9375, |
| "completions/mean_terminated_length": 10587.9375, |
| "completions/min_length": 7189.0, |
| "completions/min_terminated_length": 7189.0, |
| "entropy": 0.2850116267800331, |
| "epoch": 1.6991869918699187, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37196865677833557, |
| "learning_rate": 1.68e-07, |
| "loss": 0.0645, |
| "num_tokens": 69912961.0, |
| "reward": 1.4343760013580322, |
| "reward_std": 1.080871343612671, |
| "rewards/reward_func/mean": 1.4343760013580322, |
| "rewards/reward_func/std": 1.080871343612671, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901971220970154, |
| "sampling/importance_sampling_ratio/min": 0.3677317798137665, |
| "sampling/sampling_logp_difference/max": 1.0991556644439697, |
| "sampling/sampling_logp_difference/mean": 0.02008083090186119, |
| "step": 209, |
| "step_time": 352.02227776590735 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15064.0, |
| "completions/max_terminated_length": 15064.0, |
| "completions/mean_length": 11466.5, |
| "completions/mean_terminated_length": 11466.5, |
| "completions/min_length": 8096.0, |
| "completions/min_terminated_length": 8096.0, |
| "entropy": 0.25724725145846605, |
| "epoch": 1.7073170731707317, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3804565668106079, |
| "learning_rate": 1.64e-07, |
| "loss": -0.0099, |
| "num_tokens": 70307169.0, |
| "reward": 1.047875165939331, |
| "reward_std": 0.6655042767524719, |
| "rewards/reward_func/mean": 1.047875165939331, |
| "rewards/reward_func/std": 0.6655042767524719, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.398998260498047, |
| "sampling/importance_sampling_ratio/mean": 0.9909435510635376, |
| "sampling/importance_sampling_ratio/min": 0.037961445748806, |
| "sampling/sampling_logp_difference/max": 3.271184206008911, |
| "sampling/sampling_logp_difference/mean": 0.01853536069393158, |
| "step": 210, |
| "step_time": 377.21587240928784 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 18850.0, |
| "completions/max_terminated_length": 18850.0, |
| "completions/mean_length": 11776.53125, |
| "completions/mean_terminated_length": 11776.53125, |
| "completions/min_length": 8714.0, |
| "completions/min_terminated_length": 8714.0, |
| "entropy": 0.26618711929768324, |
| "epoch": 1.7154471544715446, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.34400245547294617, |
| "learning_rate": 1.6e-07, |
| "loss": 0.0338, |
| "num_tokens": 70713522.0, |
| "reward": 0.792148232460022, |
| "reward_std": 0.7291855812072754, |
| "rewards/reward_func/mean": 0.8139911890029907, |
| "rewards/reward_func/std": 0.701914370059967, |
| "rewards/soft_overlong_punishment_reward/mean": -0.02184295654296875, |
| "rewards/soft_overlong_punishment_reward/std": 0.10657885670661926, |
| "sampling/importance_sampling_ratio/max": 2.09429669380188, |
| "sampling/importance_sampling_ratio/mean": 0.9909163117408752, |
| "sampling/importance_sampling_ratio/min": 0.15467022359371185, |
| "sampling/sampling_logp_difference/max": 1.8664600849151611, |
| "sampling/sampling_logp_difference/mean": 0.018423916772007942, |
| "step": 211, |
| "step_time": 408.3389090951532 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14657.0, |
| "completions/max_terminated_length": 14657.0, |
| "completions/mean_length": 10683.625, |
| "completions/mean_terminated_length": 10683.625, |
| "completions/min_length": 5483.0, |
| "completions/min_terminated_length": 5483.0, |
| "entropy": 0.2850087806582451, |
| "epoch": 1.7235772357723578, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3700617551803589, |
| "learning_rate": 1.56e-07, |
| "loss": -0.0044, |
| "num_tokens": 71079998.0, |
| "reward": 1.0544310808181763, |
| "reward_std": 0.8633791208267212, |
| "rewards/reward_func/mean": 1.0544310808181763, |
| "rewards/reward_func/std": 0.8633792400360107, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0554580688476562, |
| "sampling/importance_sampling_ratio/mean": 0.990363597869873, |
| "sampling/importance_sampling_ratio/min": 0.038124918937683105, |
| "sampling/sampling_logp_difference/max": 3.2668871879577637, |
| "sampling/sampling_logp_difference/mean": 0.019684813916683197, |
| "step": 212, |
| "step_time": 422.1297117122449 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14373.0, |
| "completions/max_terminated_length": 14373.0, |
| "completions/mean_length": 9534.375, |
| "completions/mean_terminated_length": 9534.375, |
| "completions/min_length": 2458.0, |
| "completions/min_terminated_length": 2458.0, |
| "entropy": 0.28476100601255894, |
| "epoch": 1.7317073170731707, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4524003267288208, |
| "learning_rate": 1.5199999999999998e-07, |
| "loss": -0.031, |
| "num_tokens": 71408890.0, |
| "reward": 0.8519082069396973, |
| "reward_std": 0.6510614156723022, |
| "rewards/reward_func/mean": 0.8519082069396973, |
| "rewards/reward_func/std": 0.651061475276947, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9415626525878906, |
| "sampling/importance_sampling_ratio/mean": 0.9903674125671387, |
| "sampling/importance_sampling_ratio/min": 0.015267021022737026, |
| "sampling/sampling_logp_difference/max": 4.182060241699219, |
| "sampling/sampling_logp_difference/mean": 0.019843295216560364, |
| "step": 213, |
| "step_time": 335.98301385412924 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15211.0, |
| "completions/max_terminated_length": 15211.0, |
| "completions/mean_length": 9717.375, |
| "completions/mean_terminated_length": 9717.375, |
| "completions/min_length": 5821.0, |
| "completions/min_terminated_length": 5821.0, |
| "entropy": 0.2980188447982073, |
| "epoch": 1.7398373983739837, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3987669348716736, |
| "learning_rate": 1.4799999999999998e-07, |
| "loss": 0.0109, |
| "num_tokens": 71739526.0, |
| "reward": 1.5768582820892334, |
| "reward_std": 0.8727981448173523, |
| "rewards/reward_func/mean": 1.5768582820892334, |
| "rewards/reward_func/std": 0.8727982044219971, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.9979711771011353, |
| "sampling/importance_sampling_ratio/mean": 0.9897875785827637, |
| "sampling/importance_sampling_ratio/min": 0.42823854088783264, |
| "sampling/sampling_logp_difference/max": 0.8480749130249023, |
| "sampling/sampling_logp_difference/mean": 0.020624473690986633, |
| "step": 214, |
| "step_time": 444.9323614304885 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14265.0, |
| "completions/max_terminated_length": 14265.0, |
| "completions/mean_length": 10103.03125, |
| "completions/mean_terminated_length": 10103.03125, |
| "completions/min_length": 5647.0, |
| "completions/min_terminated_length": 5647.0, |
| "entropy": 0.293839693069458, |
| "epoch": 1.7479674796747968, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3844790756702423, |
| "learning_rate": 1.44e-07, |
| "loss": -0.0244, |
| "num_tokens": 72078767.0, |
| "reward": 2.067140817642212, |
| "reward_std": 2.6612210273742676, |
| "rewards/reward_func/mean": 2.067140817642212, |
| "rewards/reward_func/std": 2.6612212657928467, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.9357402324676514, |
| "sampling/importance_sampling_ratio/mean": 0.9895575046539307, |
| "sampling/importance_sampling_ratio/min": 0.04509971663355827, |
| "sampling/sampling_logp_difference/max": 3.098879337310791, |
| "sampling/sampling_logp_difference/mean": 0.020683787763118744, |
| "step": 215, |
| "step_time": 401.31944015505724 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17210.0, |
| "completions/max_terminated_length": 17210.0, |
| "completions/mean_length": 9679.5625, |
| "completions/mean_terminated_length": 9679.5625, |
| "completions/min_length": 3255.0, |
| "completions/min_terminated_length": 3255.0, |
| "entropy": 0.283124684356153, |
| "epoch": 1.7560975609756098, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3641822934150696, |
| "learning_rate": 1.4e-07, |
| "loss": 0.0163, |
| "num_tokens": 72407545.0, |
| "reward": 0.8389718532562256, |
| "reward_std": 0.8429340124130249, |
| "rewards/reward_func/mean": 0.8452737331390381, |
| "rewards/reward_func/std": 0.8356252908706665, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0063018798828125, |
| "rewards/soft_overlong_punishment_reward/std": 0.03564881905913353, |
| "sampling/importance_sampling_ratio/max": 2.0940728187561035, |
| "sampling/importance_sampling_ratio/mean": 0.9901467561721802, |
| "sampling/importance_sampling_ratio/min": 0.021902216598391533, |
| "sampling/sampling_logp_difference/max": 3.821167469024658, |
| "sampling/sampling_logp_difference/mean": 0.0196835920214653, |
| "step": 216, |
| "step_time": 491.90342234587297 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15762.0, |
| "completions/max_terminated_length": 15762.0, |
| "completions/mean_length": 10206.5, |
| "completions/mean_terminated_length": 10206.5, |
| "completions/min_length": 7317.0, |
| "completions/min_terminated_length": 7317.0, |
| "entropy": 0.28364065289497375, |
| "epoch": 1.7642276422764227, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38103634119033813, |
| "learning_rate": 1.36e-07, |
| "loss": -0.0001, |
| "num_tokens": 72752481.0, |
| "reward": 1.2134435176849365, |
| "reward_std": 0.40889275074005127, |
| "rewards/reward_func/mean": 1.2134435176849365, |
| "rewards/reward_func/std": 0.40889275074005127, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9898847341537476, |
| "sampling/importance_sampling_ratio/min": 0.153355211019516, |
| "sampling/sampling_logp_difference/max": 1.8749984502792358, |
| "sampling/sampling_logp_difference/mean": 0.020069871097803116, |
| "step": 217, |
| "step_time": 341.48434051219374 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16974.0, |
| "completions/max_terminated_length": 16974.0, |
| "completions/mean_length": 9566.84375, |
| "completions/mean_terminated_length": 9566.84375, |
| "completions/min_length": 5823.0, |
| "completions/min_terminated_length": 5823.0, |
| "entropy": 0.2919788621366024, |
| "epoch": 1.7723577235772359, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.36829307675361633, |
| "learning_rate": 1.32e-07, |
| "loss": 0.022, |
| "num_tokens": 73086028.0, |
| "reward": 1.9240275621414185, |
| "reward_std": 2.6115894317626953, |
| "rewards/reward_func/mean": 1.928528904914856, |
| "rewards/reward_func/std": 2.6107239723205566, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0045013427734375, |
| "rewards/soft_overlong_punishment_reward/std": 0.025463439524173737, |
| "sampling/importance_sampling_ratio/max": 2.1823227405548096, |
| "sampling/importance_sampling_ratio/mean": 0.9898126125335693, |
| "sampling/importance_sampling_ratio/min": 0.03777245059609413, |
| "sampling/sampling_logp_difference/max": 3.2761752605438232, |
| "sampling/sampling_logp_difference/mean": 0.020412269979715347, |
| "step": 218, |
| "step_time": 350.4455557651818 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15711.0, |
| "completions/max_terminated_length": 15711.0, |
| "completions/mean_length": 11780.21875, |
| "completions/mean_terminated_length": 11780.21875, |
| "completions/min_length": 6152.0, |
| "completions/min_terminated_length": 6152.0, |
| "entropy": 0.24620082695037127, |
| "epoch": 1.7804878048780488, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3344440460205078, |
| "learning_rate": 1.28e-07, |
| "loss": -0.0446, |
| "num_tokens": 73489971.0, |
| "reward": 0.8368304967880249, |
| "reward_std": 1.1214935779571533, |
| "rewards/reward_func/mean": 0.8368304967880249, |
| "rewards/reward_func/std": 1.1214934587478638, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.538177490234375, |
| "sampling/importance_sampling_ratio/mean": 0.9915289878845215, |
| "sampling/importance_sampling_ratio/min": 0.10874148458242416, |
| "sampling/sampling_logp_difference/max": 2.2187819480895996, |
| "sampling/sampling_logp_difference/mean": 0.01787884533405304, |
| "step": 219, |
| "step_time": 595.3615495576523 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14569.0, |
| "completions/max_terminated_length": 14569.0, |
| "completions/mean_length": 10509.15625, |
| "completions/mean_terminated_length": 10509.15625, |
| "completions/min_length": 6042.0, |
| "completions/min_terminated_length": 6042.0, |
| "entropy": 0.27453439868986607, |
| "epoch": 1.7886178861788617, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3674945533275604, |
| "learning_rate": 1.24e-07, |
| "loss": 0.0284, |
| "num_tokens": 73844368.0, |
| "reward": 0.9465268850326538, |
| "reward_std": 0.6327102184295654, |
| "rewards/reward_func/mean": 0.9465268850326538, |
| "rewards/reward_func/std": 0.6327102184295654, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9903506636619568, |
| "sampling/importance_sampling_ratio/min": 0.3419025242328644, |
| "sampling/sampling_logp_difference/max": 1.3342759609222412, |
| "sampling/sampling_logp_difference/mean": 0.019706957042217255, |
| "step": 220, |
| "step_time": 572.6308616143651 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13662.0, |
| "completions/max_terminated_length": 13662.0, |
| "completions/mean_length": 8802.75, |
| "completions/mean_terminated_length": 8802.75, |
| "completions/min_length": 3935.0, |
| "completions/min_terminated_length": 3935.0, |
| "entropy": 0.27091467659920454, |
| "epoch": 1.796747967479675, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4277440905570984, |
| "learning_rate": 1.2e-07, |
| "loss": 0.055, |
| "num_tokens": 74147944.0, |
| "reward": 1.7628746032714844, |
| "reward_std": 1.1854737997055054, |
| "rewards/reward_func/mean": 1.7628746032714844, |
| "rewards/reward_func/std": 1.1854737997055054, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.934642791748047, |
| "sampling/importance_sampling_ratio/mean": 0.9906680583953857, |
| "sampling/importance_sampling_ratio/min": 0.030376434326171875, |
| "sampling/sampling_logp_difference/max": 3.4940881729125977, |
| "sampling/sampling_logp_difference/mean": 0.01885787770152092, |
| "step": 221, |
| "step_time": 350.9161086077802 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12490.0, |
| "completions/max_terminated_length": 12490.0, |
| "completions/mean_length": 7828.4375, |
| "completions/mean_terminated_length": 7828.4375, |
| "completions/min_length": 2446.0, |
| "completions/min_terminated_length": 2446.0, |
| "entropy": 0.2959641069173813, |
| "epoch": 1.8048780487804879, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4303174614906311, |
| "learning_rate": 1.16e-07, |
| "loss": 0.0128, |
| "num_tokens": 74414678.0, |
| "reward": 1.5567618608474731, |
| "reward_std": 1.3865270614624023, |
| "rewards/reward_func/mean": 1.5567618608474731, |
| "rewards/reward_func/std": 1.386527180671692, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.5652806758880615, |
| "sampling/importance_sampling_ratio/mean": 0.9897814989089966, |
| "sampling/importance_sampling_ratio/min": 0.29385438561439514, |
| "sampling/sampling_logp_difference/max": 1.2246708869934082, |
| "sampling/sampling_logp_difference/mean": 0.020555168390274048, |
| "step": 222, |
| "step_time": 305.8508344134316 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13579.0, |
| "completions/max_terminated_length": 13579.0, |
| "completions/mean_length": 10411.9375, |
| "completions/mean_terminated_length": 10411.9375, |
| "completions/min_length": 7539.0, |
| "completions/min_terminated_length": 7539.0, |
| "entropy": 0.2726015029475093, |
| "epoch": 1.8130081300813008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35976549983024597, |
| "learning_rate": 1.12e-07, |
| "loss": -0.0198, |
| "num_tokens": 74773260.0, |
| "reward": 1.0551838874816895, |
| "reward_std": 0.5486001968383789, |
| "rewards/reward_func/mean": 1.0551838874816895, |
| "rewards/reward_func/std": 0.5486001968383789, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9905358552932739, |
| "sampling/importance_sampling_ratio/min": 0.10280817747116089, |
| "sampling/sampling_logp_difference/max": 2.274890422821045, |
| "sampling/sampling_logp_difference/mean": 0.019155049696564674, |
| "step": 223, |
| "step_time": 345.6332434962969 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15476.0, |
| "completions/max_terminated_length": 15476.0, |
| "completions/mean_length": 10071.25, |
| "completions/mean_terminated_length": 10071.25, |
| "completions/min_length": 5875.0, |
| "completions/min_terminated_length": 5875.0, |
| "entropy": 0.2811074396595359, |
| "epoch": 1.821138211382114, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4005601704120636, |
| "learning_rate": 1.0799999999999999e-07, |
| "loss": 0.0105, |
| "num_tokens": 75125052.0, |
| "reward": 1.0827958583831787, |
| "reward_std": 0.6487172842025757, |
| "rewards/reward_func/mean": 1.0827958583831787, |
| "rewards/reward_func/std": 0.6487172842025757, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.8473902940750122, |
| "sampling/importance_sampling_ratio/mean": 0.990472674369812, |
| "sampling/importance_sampling_ratio/min": 0.23046469688415527, |
| "sampling/sampling_logp_difference/max": 1.4676575660705566, |
| "sampling/sampling_logp_difference/mean": 0.019458888098597527, |
| "step": 224, |
| "step_time": 387.86132500413805 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14500.0, |
| "completions/max_terminated_length": 14500.0, |
| "completions/mean_length": 10911.75, |
| "completions/mean_terminated_length": 10911.75, |
| "completions/min_length": 7969.0, |
| "completions/min_terminated_length": 7969.0, |
| "entropy": 0.261114988476038, |
| "epoch": 1.8292682926829267, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3554852306842804, |
| "learning_rate": 1.0399999999999999e-07, |
| "loss": -0.0222, |
| "num_tokens": 75501012.0, |
| "reward": 1.656463623046875, |
| "reward_std": 1.6964542865753174, |
| "rewards/reward_func/mean": 1.656463623046875, |
| "rewards/reward_func/std": 1.6964542865753174, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.8839166164398193, |
| "sampling/importance_sampling_ratio/mean": 0.9908844828605652, |
| "sampling/importance_sampling_ratio/min": 0.188667431473732, |
| "sampling/sampling_logp_difference/max": 1.667769432067871, |
| "sampling/sampling_logp_difference/mean": 0.018746506422758102, |
| "step": 225, |
| "step_time": 437.71810589171946 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15035.0, |
| "completions/max_terminated_length": 15035.0, |
| "completions/mean_length": 9346.78125, |
| "completions/mean_terminated_length": 9346.78125, |
| "completions/min_length": 5479.0, |
| "completions/min_terminated_length": 5479.0, |
| "entropy": 0.3042516093701124, |
| "epoch": 1.8373983739837398, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.42573827505111694, |
| "learning_rate": 1e-07, |
| "loss": -0.0004, |
| "num_tokens": 75815341.0, |
| "reward": 1.1006791591644287, |
| "reward_std": 0.7395225167274475, |
| "rewards/reward_func/mean": 1.1006791591644287, |
| "rewards/reward_func/std": 0.7395225167274475, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.487826347351074, |
| "sampling/importance_sampling_ratio/mean": 0.9894092082977295, |
| "sampling/importance_sampling_ratio/min": 0.45079898834228516, |
| "sampling/sampling_logp_difference/max": 0.9114093780517578, |
| "sampling/sampling_logp_difference/mean": 0.021217308938503265, |
| "step": 226, |
| "step_time": 330.92210000101477 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14943.0, |
| "completions/max_terminated_length": 14943.0, |
| "completions/mean_length": 9735.6875, |
| "completions/mean_terminated_length": 9735.6875, |
| "completions/min_length": 5857.0, |
| "completions/min_terminated_length": 5857.0, |
| "entropy": 0.3187424521893263, |
| "epoch": 1.845528455284553, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4303300678730011, |
| "learning_rate": 9.6e-08, |
| "loss": 0.0728, |
| "num_tokens": 76140595.0, |
| "reward": 0.9577392339706421, |
| "reward_std": 0.7074591517448425, |
| "rewards/reward_func/mean": 0.9577392339706421, |
| "rewards/reward_func/std": 0.7074590921401978, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.359483003616333, |
| "sampling/importance_sampling_ratio/mean": 0.9887793660163879, |
| "sampling/importance_sampling_ratio/min": 0.19865137338638306, |
| "sampling/sampling_logp_difference/max": 1.6162039041519165, |
| "sampling/sampling_logp_difference/mean": 0.022178202867507935, |
| "step": 227, |
| "step_time": 337.2866434077732 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14953.0, |
| "completions/max_terminated_length": 14953.0, |
| "completions/mean_length": 9333.96875, |
| "completions/mean_terminated_length": 9333.96875, |
| "completions/min_length": 4264.0, |
| "completions/min_terminated_length": 4264.0, |
| "entropy": 0.28454318083822727, |
| "epoch": 1.8536585365853657, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.398489773273468, |
| "learning_rate": 9.199999999999999e-08, |
| "loss": 0.0021, |
| "num_tokens": 76460146.0, |
| "reward": 1.9499032497406006, |
| "reward_std": 1.7529157400131226, |
| "rewards/reward_func/mean": 1.9499032497406006, |
| "rewards/reward_func/std": 1.7529157400131226, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.649365186691284, |
| "sampling/importance_sampling_ratio/mean": 0.9902278780937195, |
| "sampling/importance_sampling_ratio/min": 0.0009075120906345546, |
| "sampling/sampling_logp_difference/max": 7.004803657531738, |
| "sampling/sampling_logp_difference/mean": 0.020048227161169052, |
| "step": 228, |
| "step_time": 354.2017886976246 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15784.0, |
| "completions/max_terminated_length": 15784.0, |
| "completions/mean_length": 11829.5625, |
| "completions/mean_terminated_length": 11829.5625, |
| "completions/min_length": 7684.0, |
| "completions/min_terminated_length": 7684.0, |
| "entropy": 0.26049436163157225, |
| "epoch": 1.8617886178861789, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35490682721138, |
| "learning_rate": 8.8e-08, |
| "loss": 0.0044, |
| "num_tokens": 76869980.0, |
| "reward": 0.8487756252288818, |
| "reward_std": 0.6379562020301819, |
| "rewards/reward_func/mean": 0.8487756252288818, |
| "rewards/reward_func/std": 0.6379562020301819, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9911534786224365, |
| "sampling/importance_sampling_ratio/min": 0.10822553932666779, |
| "sampling/sampling_logp_difference/max": 2.2235379219055176, |
| "sampling/sampling_logp_difference/mean": 0.018788527697324753, |
| "step": 229, |
| "step_time": 401.4078663017135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 18127.0, |
| "completions/max_terminated_length": 18127.0, |
| "completions/mean_length": 10958.9375, |
| "completions/mean_terminated_length": 10958.9375, |
| "completions/min_length": 6603.0, |
| "completions/min_terminated_length": 6603.0, |
| "entropy": 0.2978894282132387, |
| "epoch": 1.8699186991869918, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3882869482040405, |
| "learning_rate": 8.4e-08, |
| "loss": 0.0189, |
| "num_tokens": 77235146.0, |
| "reward": 1.1151258945465088, |
| "reward_std": 1.0892480611801147, |
| "rewards/reward_func/mean": 1.1284239292144775, |
| "rewards/reward_func/std": 1.0722978115081787, |
| "rewards/soft_overlong_punishment_reward/mean": -0.01329803466796875, |
| "rewards/soft_overlong_punishment_reward/std": 0.07522504031658173, |
| "sampling/importance_sampling_ratio/max": 2.948336601257324, |
| "sampling/importance_sampling_ratio/mean": 0.9897523522377014, |
| "sampling/importance_sampling_ratio/min": 0.3448498249053955, |
| "sampling/sampling_logp_difference/max": 1.0812411308288574, |
| "sampling/sampling_logp_difference/mean": 0.021154817193746567, |
| "step": 230, |
| "step_time": 382.1135653653182 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14865.0, |
| "completions/max_terminated_length": 14865.0, |
| "completions/mean_length": 10480.5, |
| "completions/mean_terminated_length": 10480.5, |
| "completions/min_length": 7920.0, |
| "completions/min_terminated_length": 7920.0, |
| "entropy": 0.26645700354129076, |
| "epoch": 1.8780487804878048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3675365149974823, |
| "learning_rate": 8e-08, |
| "loss": 0.0104, |
| "num_tokens": 77595442.0, |
| "reward": 1.146174669265747, |
| "reward_std": 0.4796574115753174, |
| "rewards/reward_func/mean": 1.146174669265747, |
| "rewards/reward_func/std": 0.47965744137763977, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9905895590782166, |
| "sampling/importance_sampling_ratio/min": 0.06983362138271332, |
| "sampling/sampling_logp_difference/max": 2.66163969039917, |
| "sampling/sampling_logp_difference/mean": 0.01905234530568123, |
| "step": 231, |
| "step_time": 409.8225920125842 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13849.0, |
| "completions/max_terminated_length": 13849.0, |
| "completions/mean_length": 10798.125, |
| "completions/mean_terminated_length": 10798.125, |
| "completions/min_length": 8638.0, |
| "completions/min_terminated_length": 8638.0, |
| "entropy": 0.30643040873110294, |
| "epoch": 1.886178861788618, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.3219410479068756, |
| "learning_rate": 7.599999999999999e-08, |
| "loss": -0.0163, |
| "num_tokens": 77959222.0, |
| "reward": 0.9619793891906738, |
| "reward_std": 0.6255706548690796, |
| "rewards/reward_func/mean": 0.9619793891906738, |
| "rewards/reward_func/std": 0.6255706548690796, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.219569444656372, |
| "sampling/importance_sampling_ratio/mean": 0.9891918301582336, |
| "sampling/importance_sampling_ratio/min": 0.1252058893442154, |
| "sampling/sampling_logp_difference/max": 2.0777957439422607, |
| "sampling/sampling_logp_difference/mean": 0.021789079532027245, |
| "step": 232, |
| "step_time": 394.94902472407557 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15015.0, |
| "completions/max_terminated_length": 15015.0, |
| "completions/mean_length": 9467.5625, |
| "completions/mean_terminated_length": 9467.5625, |
| "completions/min_length": 5704.0, |
| "completions/min_terminated_length": 5704.0, |
| "entropy": 0.29639999382197857, |
| "epoch": 1.8943089430894309, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39320817589759827, |
| "learning_rate": 7.2e-08, |
| "loss": 0.0284, |
| "num_tokens": 78276592.0, |
| "reward": 1.2161970138549805, |
| "reward_std": 0.6799400448799133, |
| "rewards/reward_func/mean": 1.2161970138549805, |
| "rewards/reward_func/std": 0.6799400448799133, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.0554580688476562, |
| "sampling/importance_sampling_ratio/mean": 0.9896321296691895, |
| "sampling/importance_sampling_ratio/min": 0.18130598962306976, |
| "sampling/sampling_logp_difference/max": 1.7075691223144531, |
| "sampling/sampling_logp_difference/mean": 0.020897675305604935, |
| "step": 233, |
| "step_time": 352.86834441358224 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14053.0, |
| "completions/max_terminated_length": 14053.0, |
| "completions/mean_length": 8768.46875, |
| "completions/mean_terminated_length": 8768.46875, |
| "completions/min_length": 5297.0, |
| "completions/min_terminated_length": 5297.0, |
| "entropy": 0.3119491506367922, |
| "epoch": 1.9024390243902438, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4400003254413605, |
| "learning_rate": 6.8e-08, |
| "loss": 0.0034, |
| "num_tokens": 78571071.0, |
| "reward": 0.8220476508140564, |
| "reward_std": 0.7369102239608765, |
| "rewards/reward_func/mean": 0.8220476508140564, |
| "rewards/reward_func/std": 0.7369101643562317, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.8682268857955933, |
| "sampling/importance_sampling_ratio/mean": 0.9890620708465576, |
| "sampling/importance_sampling_ratio/min": 0.3670034110546112, |
| "sampling/sampling_logp_difference/max": 1.0023841857910156, |
| "sampling/sampling_logp_difference/mean": 0.02175344154238701, |
| "step": 234, |
| "step_time": 319.42806904623285 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14845.0, |
| "completions/max_terminated_length": 14845.0, |
| "completions/mean_length": 9297.9375, |
| "completions/mean_terminated_length": 9297.9375, |
| "completions/min_length": 6124.0, |
| "completions/min_terminated_length": 6124.0, |
| "entropy": 0.3066948615014553, |
| "epoch": 1.910569105691057, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4045936167240143, |
| "learning_rate": 6.4e-08, |
| "loss": 0.0253, |
| "num_tokens": 78882533.0, |
| "reward": 1.8184316158294678, |
| "reward_std": 2.019770383834839, |
| "rewards/reward_func/mean": 1.8184316158294678, |
| "rewards/reward_func/std": 2.019770383834839, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.011228561401367, |
| "sampling/importance_sampling_ratio/mean": 0.9893174171447754, |
| "sampling/importance_sampling_ratio/min": 0.18020933866500854, |
| "sampling/sampling_logp_difference/max": 1.7136361598968506, |
| "sampling/sampling_logp_difference/mean": 0.021685179322957993, |
| "step": 235, |
| "step_time": 336.20116131124087 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12692.0, |
| "completions/max_terminated_length": 12692.0, |
| "completions/mean_length": 9302.34375, |
| "completions/mean_terminated_length": 9302.34375, |
| "completions/min_length": 5987.0, |
| "completions/min_terminated_length": 5987.0, |
| "entropy": 0.28985567949712276, |
| "epoch": 1.91869918699187, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.41015562415122986, |
| "learning_rate": 6e-08, |
| "loss": -0.0435, |
| "num_tokens": 79190928.0, |
| "reward": 1.1640218496322632, |
| "reward_std": 0.5124284029006958, |
| "rewards/reward_func/mean": 1.1640218496322632, |
| "rewards/reward_func/std": 0.5124284029006958, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1602694988250732, |
| "sampling/importance_sampling_ratio/mean": 0.9901226162910461, |
| "sampling/importance_sampling_ratio/min": 0.28592556715011597, |
| "sampling/sampling_logp_difference/max": 1.252023696899414, |
| "sampling/sampling_logp_difference/mean": 0.020301442593336105, |
| "step": 236, |
| "step_time": 299.4945896314457 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16871.0, |
| "completions/max_terminated_length": 16871.0, |
| "completions/mean_length": 11865.28125, |
| "completions/mean_terminated_length": 11865.28125, |
| "completions/min_length": 6579.0, |
| "completions/min_terminated_length": 6579.0, |
| "entropy": 0.26952589210122824, |
| "epoch": 1.9268292682926829, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33614400029182434, |
| "learning_rate": 5.6e-08, |
| "loss": 0.0191, |
| "num_tokens": 79590737.0, |
| "reward": 1.8198858499526978, |
| "reward_std": 4.182967662811279, |
| "rewards/reward_func/mean": 1.8236013650894165, |
| "rewards/reward_func/std": 4.181288719177246, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00371551513671875, |
| "rewards/soft_overlong_punishment_reward/std": 0.021018126979470253, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9906592965126038, |
| "sampling/importance_sampling_ratio/min": 0.26234784722328186, |
| "sampling/sampling_logp_difference/max": 1.3380839824676514, |
| "sampling/sampling_logp_difference/mean": 0.01902042329311371, |
| "step": 237, |
| "step_time": 441.34759285254404 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15871.0, |
| "completions/max_terminated_length": 15871.0, |
| "completions/mean_length": 11183.9375, |
| "completions/mean_terminated_length": 11183.9375, |
| "completions/min_length": 6786.0, |
| "completions/min_terminated_length": 6786.0, |
| "entropy": 0.2905911058187485, |
| "epoch": 1.934959349593496, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3801893889904022, |
| "learning_rate": 5.1999999999999996e-08, |
| "loss": -0.0176, |
| "num_tokens": 79965479.0, |
| "reward": 0.8954036235809326, |
| "reward_std": 0.6015174388885498, |
| "rewards/reward_func/mean": 0.8954036235809326, |
| "rewards/reward_func/std": 0.6015174388885498, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.7719638347625732, |
| "sampling/importance_sampling_ratio/mean": 0.9899714589118958, |
| "sampling/importance_sampling_ratio/min": 0.08788038045167923, |
| "sampling/sampling_logp_difference/max": 2.4317786693573, |
| "sampling/sampling_logp_difference/mean": 0.020339064300060272, |
| "step": 238, |
| "step_time": 364.3018519633915 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15284.0, |
| "completions/max_terminated_length": 15284.0, |
| "completions/mean_length": 11272.84375, |
| "completions/mean_terminated_length": 11272.84375, |
| "completions/min_length": 7006.0, |
| "completions/min_terminated_length": 7006.0, |
| "entropy": 0.27012884337455034, |
| "epoch": 1.943089430894309, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3917267322540283, |
| "learning_rate": 4.8e-08, |
| "loss": 0.0152, |
| "num_tokens": 80347138.0, |
| "reward": 3.6909549236297607, |
| "reward_std": 5.090480327606201, |
| "rewards/reward_func/mean": 3.6909549236297607, |
| "rewards/reward_func/std": 5.090480804443359, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9906477332115173, |
| "sampling/importance_sampling_ratio/min": 0.17718777060508728, |
| "sampling/sampling_logp_difference/max": 1.7883415222167969, |
| "sampling/sampling_logp_difference/mean": 0.019167255610227585, |
| "step": 239, |
| "step_time": 477.38329637190327 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13860.0, |
| "completions/max_terminated_length": 13860.0, |
| "completions/mean_length": 10537.125, |
| "completions/mean_terminated_length": 10537.125, |
| "completions/min_length": 7223.0, |
| "completions/min_terminated_length": 7223.0, |
| "entropy": 0.2844161428511143, |
| "epoch": 1.951219512195122, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35918739438056946, |
| "learning_rate": 4.4e-08, |
| "loss": -0.0147, |
| "num_tokens": 80703366.0, |
| "reward": 1.0506994724273682, |
| "reward_std": 0.6146509647369385, |
| "rewards/reward_func/mean": 1.0506994724273682, |
| "rewards/reward_func/std": 0.6146509051322937, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.868777871131897, |
| "sampling/importance_sampling_ratio/mean": 0.9900673627853394, |
| "sampling/importance_sampling_ratio/min": 0.10262572020292282, |
| "sampling/sampling_logp_difference/max": 2.2766666412353516, |
| "sampling/sampling_logp_difference/mean": 0.01999824121594429, |
| "step": 240, |
| "step_time": 330.3526580410544 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 17687.0, |
| "completions/max_terminated_length": 17687.0, |
| "completions/mean_length": 11904.0, |
| "completions/mean_terminated_length": 11904.0, |
| "completions/min_length": 6393.0, |
| "completions/min_terminated_length": 6393.0, |
| "entropy": 0.2790603097528219, |
| "epoch": 1.959349593495935, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37855809926986694, |
| "learning_rate": 4e-08, |
| "loss": 0.0527, |
| "num_tokens": 81114366.0, |
| "reward": 0.9515074491500854, |
| "reward_std": 0.5839796662330627, |
| "rewards/reward_func/mean": 0.9614485502243042, |
| "rewards/reward_func/std": 0.588039755821228, |
| "rewards/soft_overlong_punishment_reward/mean": -0.00994110107421875, |
| "rewards/soft_overlong_punishment_reward/std": 0.056235358119010925, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9902161955833435, |
| "sampling/importance_sampling_ratio/min": 0.05338960140943527, |
| "sampling/sampling_logp_difference/max": 2.9301393032073975, |
| "sampling/sampling_logp_difference/mean": 0.01992051862180233, |
| "step": 241, |
| "step_time": 570.8844402271789 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16672.0, |
| "completions/max_terminated_length": 16672.0, |
| "completions/mean_length": 10269.0625, |
| "completions/mean_terminated_length": 10269.0625, |
| "completions/min_length": 4406.0, |
| "completions/min_terminated_length": 4406.0, |
| "entropy": 0.3053451906889677, |
| "epoch": 1.967479674796748, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.38739898800849915, |
| "learning_rate": 3.6e-08, |
| "loss": -0.0214, |
| "num_tokens": 81456344.0, |
| "reward": 1.5581142902374268, |
| "reward_std": 1.299147129058838, |
| "rewards/reward_func/mean": 1.5603115558624268, |
| "rewards/reward_func/std": 1.298970341682434, |
| "rewards/soft_overlong_punishment_reward/mean": -0.002197265625, |
| "rewards/soft_overlong_punishment_reward/std": 0.012429611757397652, |
| "sampling/importance_sampling_ratio/max": 1.9272154569625854, |
| "sampling/importance_sampling_ratio/mean": 0.9893536567687988, |
| "sampling/importance_sampling_ratio/min": 0.2590552270412445, |
| "sampling/sampling_logp_difference/max": 1.3507139682769775, |
| "sampling/sampling_logp_difference/mean": 0.021601153537631035, |
| "step": 242, |
| "step_time": 396.345472401008 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 13318.0, |
| "completions/max_terminated_length": 13318.0, |
| "completions/mean_length": 8007.4375, |
| "completions/mean_terminated_length": 8007.4375, |
| "completions/min_length": 4218.0, |
| "completions/min_terminated_length": 4218.0, |
| "entropy": 0.3001875299960375, |
| "epoch": 1.975609756097561, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.4350431263446808, |
| "learning_rate": 3.2e-08, |
| "loss": -0.0367, |
| "num_tokens": 81723414.0, |
| "reward": 1.3142027854919434, |
| "reward_std": 0.18320664763450623, |
| "rewards/reward_func/mean": 1.3142027854919434, |
| "rewards/reward_func/std": 0.18320664763450623, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.1279447078704834, |
| "sampling/importance_sampling_ratio/mean": 0.9896602034568787, |
| "sampling/importance_sampling_ratio/min": 0.34697410464286804, |
| "sampling/sampling_logp_difference/max": 1.0585050582885742, |
| "sampling/sampling_logp_difference/mean": 0.02109161950647831, |
| "step": 243, |
| "step_time": 296.47460809443146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15357.0, |
| "completions/max_terminated_length": 15357.0, |
| "completions/mean_length": 11084.78125, |
| "completions/mean_terminated_length": 11084.78125, |
| "completions/min_length": 7900.0, |
| "completions/min_terminated_length": 7900.0, |
| "entropy": 0.2727947048842907, |
| "epoch": 1.9837398373983741, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.748999297618866, |
| "learning_rate": 2.8e-08, |
| "loss": 0.0009, |
| "num_tokens": 82106623.0, |
| "reward": 1.7209361791610718, |
| "reward_std": 1.6489285230636597, |
| "rewards/reward_func/mean": 1.7209361791610718, |
| "rewards/reward_func/std": 1.6489285230636597, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9903358817100525, |
| "sampling/importance_sampling_ratio/min": 0.017168119549751282, |
| "sampling/sampling_logp_difference/max": 4.064701080322266, |
| "sampling/sampling_logp_difference/mean": 0.01944074034690857, |
| "step": 244, |
| "step_time": 410.3419267425779 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 14174.0, |
| "completions/max_terminated_length": 14174.0, |
| "completions/mean_length": 9691.53125, |
| "completions/mean_terminated_length": 9691.53125, |
| "completions/min_length": 6656.0, |
| "completions/min_terminated_length": 6656.0, |
| "entropy": 0.3040638938546181, |
| "epoch": 1.9918699186991868, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3910830318927765, |
| "learning_rate": 2.4e-08, |
| "loss": -0.0349, |
| "num_tokens": 82432072.0, |
| "reward": 1.075455665588379, |
| "reward_std": 0.8865517377853394, |
| "rewards/reward_func/mean": 1.075455665588379, |
| "rewards/reward_func/std": 0.8865517973899841, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 2.2025558948516846, |
| "sampling/importance_sampling_ratio/mean": 0.9893741011619568, |
| "sampling/importance_sampling_ratio/min": 0.3180047869682312, |
| "sampling/sampling_logp_difference/max": 1.145688772201538, |
| "sampling/sampling_logp_difference/mean": 0.02134772762656212, |
| "step": 245, |
| "step_time": 321.8410828395281 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15518.0, |
| "completions/max_terminated_length": 15518.0, |
| "completions/mean_length": 10144.40625, |
| "completions/mean_terminated_length": 10144.40625, |
| "completions/min_length": 5058.0, |
| "completions/min_terminated_length": 5058.0, |
| "entropy": 0.29126408137381077, |
| "epoch": 2.0, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.34645918011665344, |
| "learning_rate": 2e-08, |
| "loss": -0.0083, |
| "num_tokens": 82772749.0, |
| "reward": 1.1977554559707642, |
| "reward_std": 1.0357450246810913, |
| "rewards/reward_func/mean": 1.1977554559707642, |
| "rewards/reward_func/std": 1.0357449054718018, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9901622533798218, |
| "sampling/importance_sampling_ratio/min": 0.0009917415445670485, |
| "sampling/sampling_logp_difference/max": 6.916048049926758, |
| "sampling/sampling_logp_difference/mean": 0.020361589267849922, |
| "step": 246, |
| "step_time": 377.6803415892646 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16982.0, |
| "completions/max_terminated_length": 16982.0, |
| "completions/mean_length": 9476.9375, |
| "completions/mean_terminated_length": 9476.9375, |
| "completions/min_length": 3324.0, |
| "completions/min_terminated_length": 3324.0, |
| "entropy": 0.2707563489675522, |
| "epoch": 2.008130081300813, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.39088794589042664, |
| "learning_rate": 1.6e-08, |
| "loss": 0.0193, |
| "num_tokens": 83098347.0, |
| "reward": 1.1425082683563232, |
| "reward_std": 0.49591726064682007, |
| "rewards/reward_func/mean": 1.1470706462860107, |
| "rewards/reward_func/std": 0.4847003221511841, |
| "rewards/soft_overlong_punishment_reward/mean": -0.0045623779296875, |
| "rewards/soft_overlong_punishment_reward/std": 0.025808705016970634, |
| "sampling/importance_sampling_ratio/max": 2.0281832218170166, |
| "sampling/importance_sampling_ratio/mean": 0.9908162355422974, |
| "sampling/importance_sampling_ratio/min": 0.12059780210256577, |
| "sampling/sampling_logp_difference/max": 2.1152942180633545, |
| "sampling/sampling_logp_difference/mean": 0.018710073083639145, |
| "step": 247, |
| "step_time": 388.46342860814184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 15937.0, |
| "completions/max_terminated_length": 15937.0, |
| "completions/mean_length": 10819.40625, |
| "completions/mean_terminated_length": 10819.40625, |
| "completions/min_length": 5389.0, |
| "completions/min_terminated_length": 5389.0, |
| "entropy": 0.24684153776615858, |
| "epoch": 2.016260162601626, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.35437819361686707, |
| "learning_rate": 1.2e-08, |
| "loss": -0.026, |
| "num_tokens": 83472368.0, |
| "reward": 0.9455545544624329, |
| "reward_std": 0.6301490664482117, |
| "rewards/reward_func/mean": 0.9455545544624329, |
| "rewards/reward_func/std": 0.6301490664482117, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 0.9916300773620605, |
| "sampling/importance_sampling_ratio/min": 0.03407047688961029, |
| "sampling/sampling_logp_difference/max": 3.379323959350586, |
| "sampling/sampling_logp_difference/mean": 0.01745373196899891, |
| "step": 248, |
| "step_time": 433.2920012548566 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 12313.0, |
| "completions/max_terminated_length": 12313.0, |
| "completions/mean_length": 8494.34375, |
| "completions/mean_terminated_length": 8494.34375, |
| "completions/min_length": 5617.0, |
| "completions/min_terminated_length": 5617.0, |
| "entropy": 0.3050242904573679, |
| "epoch": 2.024390243902439, |
| "frac_reward_zero_std": 0.25, |
| "grad_norm": 0.38736531138420105, |
| "learning_rate": 8e-09, |
| "loss": 0.0544, |
| "num_tokens": 83755963.0, |
| "reward": 1.0841058492660522, |
| "reward_std": 0.6200099587440491, |
| "rewards/reward_func/mean": 1.0841058492660522, |
| "rewards/reward_func/std": 0.6200099587440491, |
| "rewards/soft_overlong_punishment_reward/mean": 0.0, |
| "rewards/soft_overlong_punishment_reward/std": 0.0, |
| "sampling/importance_sampling_ratio/max": 1.7827914953231812, |
| "sampling/importance_sampling_ratio/mean": 0.9893178939819336, |
| "sampling/importance_sampling_ratio/min": 0.07585831731557846, |
| "sampling/sampling_logp_difference/max": 2.578887939453125, |
| "sampling/sampling_logp_difference/mean": 0.021333064883947372, |
| "step": 249, |
| "step_time": 296.1807095913682 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 16956.0, |
| "completions/max_terminated_length": 16956.0, |
| "completions/mean_length": 11429.84375, |
| "completions/mean_terminated_length": 11429.84375, |
| "completions/min_length": 7864.0, |
| "completions/min_terminated_length": 7864.0, |
| "entropy": 0.2977316789329052, |
| "epoch": 2.032520325203252, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3691120445728302, |
| "learning_rate": 4e-09, |
| "loss": -0.0215, |
| "num_tokens": 84138822.0, |
| "reward": 0.8533658981323242, |
| "reward_std": 0.6907368898391724, |
| "rewards/reward_func/mean": 0.8577299118041992, |
| "rewards/reward_func/std": 0.6934342384338379, |
| "rewards/soft_overlong_punishment_reward/mean": -0.004364013671875, |
| "rewards/soft_overlong_punishment_reward/std": 0.02468658797442913, |
| "sampling/importance_sampling_ratio/max": 2.4678995609283447, |
| "sampling/importance_sampling_ratio/mean": 0.9895965456962585, |
| "sampling/importance_sampling_ratio/min": 0.27175045013427734, |
| "sampling/sampling_logp_difference/max": 1.3028711080551147, |
| "sampling/sampling_logp_difference/mean": 0.02109983190894127, |
| "step": 250, |
| "step_time": 396.94350118376315 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 250, |
| "num_input_tokens_seen": 84138822, |
| "num_train_epochs": 3, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|