| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.00424, |
| "eval_steps": 500, |
| "global_step": 106, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9274.0, |
| "completions/max_terminated_length": 9274.0, |
| "completions/mean_length": 8544.65625, |
| "completions/mean_terminated_length": 8544.65625, |
| "completions/min_length": 5146.0, |
| "completions/min_terminated_length": 5146.0, |
| "entropy": 0.10155441798269749, |
| "epoch": 4e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3302725553512573, |
| "kl": 0.0, |
| "learning_rate": 0.0, |
| "loss": -0.0468, |
| "num_tokens": 300475.0, |
| "reward": -0.43501541018486023, |
| "reward_std": 0.25424131751060486, |
| "rewards/rollout_reward_func/mean": -0.43501541018486023, |
| "rewards/rollout_reward_func/std": 0.3262947201728821, |
| "sampling/importance_sampling_ratio/max": 1.4890494346618652, |
| "sampling/importance_sampling_ratio/mean": 0.9993953704833984, |
| "sampling/importance_sampling_ratio/min": 0.5384275317192078, |
| "sampling/sampling_logp_difference/max": 0.6191023588180542, |
| "sampling/sampling_logp_difference/mean": 0.009896567091345787, |
| "step": 1, |
| "step_time": 99.01410378399987 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9192.0, |
| "completions/max_terminated_length": 9192.0, |
| "completions/mean_length": 6776.0625, |
| "completions/mean_terminated_length": 6776.0625, |
| "completions/min_length": 296.0, |
| "completions/min_terminated_length": 296.0, |
| "entropy": 0.0874525704421103, |
| "epoch": 8e-05, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6579537391662598, |
| "kl": 0.0, |
| "learning_rate": 2.2857142857142855e-07, |
| "loss": -0.1968, |
| "num_tokens": 544729.0, |
| "reward": -0.2837252914905548, |
| "reward_std": 0.5439483523368835, |
| "rewards/rollout_reward_func/mean": -0.2837252914905548, |
| "rewards/rollout_reward_func/std": 0.6020905375480652, |
| "sampling/importance_sampling_ratio/max": 1.7989556789398193, |
| "sampling/importance_sampling_ratio/mean": 0.9998900890350342, |
| "sampling/importance_sampling_ratio/min": 0.45409083366394043, |
| "sampling/sampling_logp_difference/max": 0.7894580364227295, |
| "sampling/sampling_logp_difference/mean": 0.01010945439338684, |
| "step": 2, |
| "step_time": 87.14777932000197 |
| }, |
| { |
| "clip_ratio/high_max": 0.004603212466463447, |
| "clip_ratio/high_mean": 0.0023016062332317233, |
| "clip_ratio/low_mean": 0.0018652374274097383, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004166843631537631, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9212.0, |
| "completions/max_terminated_length": 9212.0, |
| "completions/mean_length": 7882.5625, |
| "completions/mean_terminated_length": 7882.5625, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "entropy": 0.09430563636124134, |
| "epoch": 0.00012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.605349063873291, |
| "kl": 0.0011047614534618333, |
| "learning_rate": 4.571428571428571e-07, |
| "loss": -0.1305, |
| "num_tokens": 823996.0, |
| "reward": -0.4043263792991638, |
| "reward_std": 0.3804655075073242, |
| "rewards/rollout_reward_func/mean": -0.4043263792991638, |
| "rewards/rollout_reward_func/std": 0.40240201354026794, |
| "sampling/importance_sampling_ratio/max": 1.976324200630188, |
| "sampling/importance_sampling_ratio/mean": 0.9999128580093384, |
| "sampling/importance_sampling_ratio/min": 0.3961387276649475, |
| "sampling/sampling_logp_difference/max": 0.9259908199310303, |
| "sampling/sampling_logp_difference/mean": 0.009135385975241661, |
| "step": 3, |
| "step_time": 94.82623126800036 |
| }, |
| { |
| "clip_ratio/high_max": 0.0036170141538605094, |
| "clip_ratio/high_mean": 0.0018085070769302547, |
| "clip_ratio/low_mean": 0.003806174616329372, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005614681693259627, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8937.0, |
| "completions/max_terminated_length": 8937.0, |
| "completions/mean_length": 7744.6875, |
| "completions/mean_terminated_length": 7744.6875, |
| "completions/min_length": 293.0, |
| "completions/min_terminated_length": 293.0, |
| "entropy": 0.1019767300458625, |
| "epoch": 0.00016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7453252077102661, |
| "kl": 0.001767605485298418, |
| "learning_rate": 6.857142857142857e-07, |
| "loss": -0.1059, |
| "num_tokens": 1099670.0, |
| "reward": -0.42792099714279175, |
| "reward_std": 0.40188515186309814, |
| "rewards/rollout_reward_func/mean": -0.42792099714279175, |
| "rewards/rollout_reward_func/std": 0.44791290163993835, |
| "sampling/importance_sampling_ratio/max": 2.167020559310913, |
| "sampling/importance_sampling_ratio/mean": 1.0016282796859741, |
| "sampling/importance_sampling_ratio/min": 0.5168222784996033, |
| "sampling/sampling_logp_difference/max": 0.7733532190322876, |
| "sampling/sampling_logp_difference/mean": 0.011834039352834225, |
| "step": 4, |
| "step_time": 93.63879696800086 |
| }, |
| { |
| "clip_ratio/high_max": 0.0066998383263126016, |
| "clip_ratio/high_mean": 0.0040601465734653175, |
| "clip_ratio/low_mean": 0.00286985796992667, |
| "clip_ratio/low_min": 0.0007102272938936949, |
| "clip_ratio/region_mean": 0.006930004572495818, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9072.0, |
| "completions/max_terminated_length": 9072.0, |
| "completions/mean_length": 7945.375, |
| "completions/mean_terminated_length": 7945.375, |
| "completions/min_length": 1143.0, |
| "completions/min_terminated_length": 1143.0, |
| "entropy": 0.10537219722755253, |
| "epoch": 0.0002, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5562094449996948, |
| "kl": 0.0011164914703840623, |
| "learning_rate": 9.142857142857142e-07, |
| "loss": -0.0856, |
| "num_tokens": 1381159.0, |
| "reward": -0.44396543502807617, |
| "reward_std": 0.2552921175956726, |
| "rewards/rollout_reward_func/mean": -0.44396543502807617, |
| "rewards/rollout_reward_func/std": 0.3181779980659485, |
| "sampling/importance_sampling_ratio/max": 1.9132328033447266, |
| "sampling/importance_sampling_ratio/mean": 0.9998804330825806, |
| "sampling/importance_sampling_ratio/min": 0.5040712952613831, |
| "sampling/sampling_logp_difference/max": 0.6850376129150391, |
| "sampling/sampling_logp_difference/mean": 0.011912481859326363, |
| "step": 5, |
| "step_time": 95.56573534999961 |
| }, |
| { |
| "clip_ratio/high_max": 0.005053992324974388, |
| "clip_ratio/high_mean": 0.002878119732486084, |
| "clip_ratio/low_mean": 0.001978989952476695, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00485710974317044, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9032.0, |
| "completions/max_terminated_length": 9032.0, |
| "completions/mean_length": 7585.09375, |
| "completions/mean_terminated_length": 7585.09375, |
| "completions/min_length": 297.0, |
| "completions/min_terminated_length": 297.0, |
| "entropy": 0.08556636655703187, |
| "epoch": 0.00024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3749723434448242, |
| "kl": 0.000942698896778893, |
| "learning_rate": 1.1428571428571428e-06, |
| "loss": -0.2173, |
| "num_tokens": 1651286.0, |
| "reward": -0.41596484184265137, |
| "reward_std": 0.29581165313720703, |
| "rewards/rollout_reward_func/mean": -0.41596484184265137, |
| "rewards/rollout_reward_func/std": 0.38508185744285583, |
| "sampling/importance_sampling_ratio/max": 2.4631097316741943, |
| "sampling/importance_sampling_ratio/mean": 1.0003317594528198, |
| "sampling/importance_sampling_ratio/min": 0.4878324866294861, |
| "sampling/sampling_logp_difference/max": 0.9014246463775635, |
| "sampling/sampling_logp_difference/mean": 0.009676506742835045, |
| "step": 6, |
| "step_time": 91.97893484500082 |
| }, |
| { |
| "clip_ratio/high_max": 0.007268621528055519, |
| "clip_ratio/high_mean": 0.004720250406535342, |
| "clip_ratio/low_mean": 0.0014985245070420206, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006218774913577363, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9210.0, |
| "completions/max_terminated_length": 9210.0, |
| "completions/mean_length": 8051.28125, |
| "completions/mean_terminated_length": 8051.28125, |
| "completions/min_length": 278.0, |
| "completions/min_terminated_length": 278.0, |
| "entropy": 0.11584594147279859, |
| "epoch": 0.00028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4247251749038696, |
| "kl": 0.0014235296439437661, |
| "learning_rate": 1.3714285714285715e-06, |
| "loss": -0.1468, |
| "num_tokens": 1936745.0, |
| "reward": -0.5240695476531982, |
| "reward_std": 0.3336127996444702, |
| "rewards/rollout_reward_func/mean": -0.5240695476531982, |
| "rewards/rollout_reward_func/std": 0.3330491781234741, |
| "sampling/importance_sampling_ratio/max": 1.7502962350845337, |
| "sampling/importance_sampling_ratio/mean": 0.9991633892059326, |
| "sampling/importance_sampling_ratio/min": 0.3610191345214844, |
| "sampling/sampling_logp_difference/max": 1.0188243389129639, |
| "sampling/sampling_logp_difference/mean": 0.011139116249978542, |
| "step": 7, |
| "step_time": 95.33587853800009 |
| }, |
| { |
| "clip_ratio/high_max": 0.007413623738102615, |
| "clip_ratio/high_mean": 0.0037068118690513074, |
| "clip_ratio/low_mean": 0.0014727154921274632, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005179527361178771, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9325.0, |
| "completions/max_terminated_length": 9325.0, |
| "completions/mean_length": 7462.8125, |
| "completions/mean_terminated_length": 7462.8125, |
| "completions/min_length": 640.0, |
| "completions/min_terminated_length": 640.0, |
| "entropy": 0.08036898588761687, |
| "epoch": 0.00032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6594958305358887, |
| "kl": 0.0006596859343517281, |
| "learning_rate": 1.6e-06, |
| "loss": -0.0556, |
| "num_tokens": 2203101.0, |
| "reward": -0.35628482699394226, |
| "reward_std": 0.4496096074581146, |
| "rewards/rollout_reward_func/mean": -0.35628482699394226, |
| "rewards/rollout_reward_func/std": 0.5717853307723999, |
| "sampling/importance_sampling_ratio/max": 1.823210597038269, |
| "sampling/importance_sampling_ratio/mean": 1.0012201070785522, |
| "sampling/importance_sampling_ratio/min": 0.6447485089302063, |
| "sampling/sampling_logp_difference/max": 0.6005990505218506, |
| "sampling/sampling_logp_difference/mean": 0.009084178134799004, |
| "step": 8, |
| "step_time": 91.30933555899946 |
| }, |
| { |
| "clip_ratio/high_max": 0.008599455235525966, |
| "clip_ratio/high_mean": 0.004299727617762983, |
| "clip_ratio/low_mean": 0.0007231474155560136, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005022875062422827, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9116.0, |
| "completions/max_terminated_length": 9116.0, |
| "completions/mean_length": 7942.28125, |
| "completions/mean_terminated_length": 7942.28125, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "entropy": 0.09118380001746118, |
| "epoch": 0.00036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2313306331634521, |
| "kl": 0.0009252334230041015, |
| "learning_rate": 1.8285714285714284e-06, |
| "loss": -0.2388, |
| "num_tokens": 2485072.0, |
| "reward": -0.5225930213928223, |
| "reward_std": 0.2826329469680786, |
| "rewards/rollout_reward_func/mean": -0.5225930213928223, |
| "rewards/rollout_reward_func/std": 0.29529422521591187, |
| "sampling/importance_sampling_ratio/max": 1.675890326499939, |
| "sampling/importance_sampling_ratio/mean": 0.998712420463562, |
| "sampling/importance_sampling_ratio/min": 0.4924771189689636, |
| "sampling/sampling_logp_difference/max": 0.7083072662353516, |
| "sampling/sampling_logp_difference/mean": 0.00942724198102951, |
| "step": 9, |
| "step_time": 95.65081479300034 |
| }, |
| { |
| "clip_ratio/high_max": 0.0057004125555977225, |
| "clip_ratio/high_mean": 0.0028502062777988613, |
| "clip_ratio/low_mean": 0.001784498308552429, |
| "clip_ratio/low_min": 0.0007022471982054412, |
| "clip_ratio/region_mean": 0.00463470458635129, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9102.0, |
| "completions/max_terminated_length": 9102.0, |
| "completions/mean_length": 7725.46875, |
| "completions/mean_terminated_length": 7725.46875, |
| "completions/min_length": 1577.0, |
| "completions/min_terminated_length": 1577.0, |
| "entropy": 0.10202601249329746, |
| "epoch": 0.0004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2979556322097778, |
| "kl": 0.0008248507838288788, |
| "learning_rate": 2.057142857142857e-06, |
| "loss": 0.0504, |
| "num_tokens": 2759975.0, |
| "reward": -0.4137440621852875, |
| "reward_std": 0.4455605745315552, |
| "rewards/rollout_reward_func/mean": -0.4137440621852875, |
| "rewards/rollout_reward_func/std": 0.5681707859039307, |
| "sampling/importance_sampling_ratio/max": 1.804335355758667, |
| "sampling/importance_sampling_ratio/mean": 1.0012147426605225, |
| "sampling/importance_sampling_ratio/min": 0.42815232276916504, |
| "sampling/sampling_logp_difference/max": 0.8482762575149536, |
| "sampling/sampling_logp_difference/mean": 0.011439252644777298, |
| "step": 10, |
| "step_time": 94.30586862800192 |
| }, |
| { |
| "clip_ratio/high_max": 0.005768664239440113, |
| "clip_ratio/high_mean": 0.003239445824874565, |
| "clip_ratio/low_mean": 0.0021694422175642103, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005408888071542606, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9448.0, |
| "completions/max_terminated_length": 9448.0, |
| "completions/mean_length": 8284.78125, |
| "completions/mean_terminated_length": 8284.78125, |
| "completions/min_length": 3093.0, |
| "completions/min_terminated_length": 3093.0, |
| "entropy": 0.08923958241939545, |
| "epoch": 0.00044, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.318570613861084, |
| "kl": 0.0008534059998055454, |
| "learning_rate": 2.2857142857142856e-06, |
| "loss": -0.0948, |
| "num_tokens": 3052665.0, |
| "reward": -0.5442342162132263, |
| "reward_std": 0.23896004259586334, |
| "rewards/rollout_reward_func/mean": -0.5442342162132263, |
| "rewards/rollout_reward_func/std": 0.3197546601295471, |
| "sampling/importance_sampling_ratio/max": 1.7523086071014404, |
| "sampling/importance_sampling_ratio/mean": 1.0002540349960327, |
| "sampling/importance_sampling_ratio/min": 0.30823758244514465, |
| "sampling/sampling_logp_difference/max": 1.176884412765503, |
| "sampling/sampling_logp_difference/mean": 0.010090906172990799, |
| "step": 11, |
| "step_time": 99.13019177800106 |
| }, |
| { |
| "clip_ratio/high_max": 0.005701107264030725, |
| "clip_ratio/high_mean": 0.004259218374500051, |
| "clip_ratio/low_mean": 0.0021850839839316905, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006444302358431742, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9421.0, |
| "completions/max_terminated_length": 9421.0, |
| "completions/mean_length": 8058.28125, |
| "completions/mean_terminated_length": 8058.28125, |
| "completions/min_length": 2072.0, |
| "completions/min_terminated_length": 2072.0, |
| "entropy": 0.10937830060720444, |
| "epoch": 0.00048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4182522296905518, |
| "kl": 0.0009953968228728627, |
| "learning_rate": 2.5142857142857142e-06, |
| "loss": -0.0408, |
| "num_tokens": 3337753.0, |
| "reward": -0.47211629152297974, |
| "reward_std": 0.33418089151382446, |
| "rewards/rollout_reward_func/mean": -0.47211629152297974, |
| "rewards/rollout_reward_func/std": 0.3411623537540436, |
| "sampling/importance_sampling_ratio/max": 1.9659794569015503, |
| "sampling/importance_sampling_ratio/mean": 1.00056791305542, |
| "sampling/importance_sampling_ratio/min": 0.4571618437767029, |
| "sampling/sampling_logp_difference/max": 0.7827178239822388, |
| "sampling/sampling_logp_difference/mean": 0.011176912114024162, |
| "step": 12, |
| "step_time": 99.95414250199883 |
| }, |
| { |
| "clip_ratio/high_max": 0.006522299780044705, |
| "clip_ratio/high_mean": 0.00396738713607192, |
| "clip_ratio/low_mean": 0.003070202248636633, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007037589413812384, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9322.0, |
| "completions/max_terminated_length": 9322.0, |
| "completions/mean_length": 7458.75, |
| "completions/mean_terminated_length": 7458.75, |
| "completions/min_length": 404.0, |
| "completions/min_terminated_length": 404.0, |
| "entropy": 0.10976240411400795, |
| "epoch": 0.00052, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.612453579902649, |
| "kl": 0.0007661073350391234, |
| "learning_rate": 2.742857142857143e-06, |
| "loss": -0.2568, |
| "num_tokens": 3603216.0, |
| "reward": -0.5523585081100464, |
| "reward_std": 0.3823288381099701, |
| "rewards/rollout_reward_func/mean": -0.5523585081100464, |
| "rewards/rollout_reward_func/std": 0.4409903883934021, |
| "sampling/importance_sampling_ratio/max": 1.8682420253753662, |
| "sampling/importance_sampling_ratio/mean": 0.9997111558914185, |
| "sampling/importance_sampling_ratio/min": 0.6721150875091553, |
| "sampling/sampling_logp_difference/max": 0.6249978542327881, |
| "sampling/sampling_logp_difference/mean": 0.011141350492835045, |
| "step": 13, |
| "step_time": 91.4230538769998 |
| }, |
| { |
| "clip_ratio/high_max": 0.004288507916498929, |
| "clip_ratio/high_mean": 0.0021442539582494646, |
| "clip_ratio/low_mean": 0.0032669483334757388, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005411202291725203, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8990.0, |
| "completions/max_terminated_length": 8990.0, |
| "completions/mean_length": 7743.78125, |
| "completions/mean_terminated_length": 7743.78125, |
| "completions/min_length": 2922.0, |
| "completions/min_terminated_length": 2922.0, |
| "entropy": 0.09799129283055663, |
| "epoch": 0.00056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2676411867141724, |
| "kl": 0.0006998809913056903, |
| "learning_rate": 2.9714285714285716e-06, |
| "loss": -0.0431, |
| "num_tokens": 3878088.0, |
| "reward": -0.4721910059452057, |
| "reward_std": 0.4045681357383728, |
| "rewards/rollout_reward_func/mean": -0.4721910059452057, |
| "rewards/rollout_reward_func/std": 0.4375231862068176, |
| "sampling/importance_sampling_ratio/max": 1.5526798963546753, |
| "sampling/importance_sampling_ratio/mean": 0.9998083114624023, |
| "sampling/importance_sampling_ratio/min": 0.5838956236839294, |
| "sampling/sampling_logp_difference/max": 0.5380330085754395, |
| "sampling/sampling_logp_difference/mean": 0.009330052882432938, |
| "step": 14, |
| "step_time": 96.34136769400084 |
| }, |
| { |
| "clip_ratio/high_max": 0.0007102272938936949, |
| "clip_ratio/high_mean": 0.00035511364694684744, |
| "clip_ratio/low_mean": 0.001096121472073719, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0014512351190205663, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9132.0, |
| "completions/max_terminated_length": 9132.0, |
| "completions/mean_length": 8004.5625, |
| "completions/mean_terminated_length": 8004.5625, |
| "completions/min_length": 2190.0, |
| "completions/min_terminated_length": 2190.0, |
| "entropy": 0.10588292591273785, |
| "epoch": 0.0006, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2579617500305176, |
| "kl": 0.0006010868073644815, |
| "learning_rate": 3.2e-06, |
| "loss": -0.0555, |
| "num_tokens": 4160851.0, |
| "reward": -0.43845027685165405, |
| "reward_std": 0.39296823740005493, |
| "rewards/rollout_reward_func/mean": -0.43845027685165405, |
| "rewards/rollout_reward_func/std": 0.41991615295410156, |
| "sampling/importance_sampling_ratio/max": 1.5303796529769897, |
| "sampling/importance_sampling_ratio/mean": 0.9994367361068726, |
| "sampling/importance_sampling_ratio/min": 0.4505850076675415, |
| "sampling/sampling_logp_difference/max": 0.7972085475921631, |
| "sampling/sampling_logp_difference/mean": 0.010887149721384048, |
| "step": 15, |
| "step_time": 94.16652587299814 |
| }, |
| { |
| "clip_ratio/high_max": 0.005843322374857962, |
| "clip_ratio/high_mean": 0.002921661187428981, |
| "clip_ratio/low_mean": 0.001611912128282711, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004533573315711692, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9094.0, |
| "completions/max_terminated_length": 9094.0, |
| "completions/mean_length": 7290.71875, |
| "completions/mean_terminated_length": 7290.71875, |
| "completions/min_length": 399.0, |
| "completions/min_terminated_length": 399.0, |
| "entropy": 0.0855347712058574, |
| "epoch": 0.00064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.503034234046936, |
| "kl": 0.0009320054400632216, |
| "learning_rate": 3.428571428571428e-06, |
| "loss": -0.2363, |
| "num_tokens": 4421376.0, |
| "reward": -0.6369705200195312, |
| "reward_std": 0.26649945974349976, |
| "rewards/rollout_reward_func/mean": -0.6369705200195312, |
| "rewards/rollout_reward_func/std": 0.2727498710155487, |
| "sampling/importance_sampling_ratio/max": 2.165536403656006, |
| "sampling/importance_sampling_ratio/mean": 1.0002843141555786, |
| "sampling/importance_sampling_ratio/min": 0.2399691492319107, |
| "sampling/sampling_logp_difference/max": 1.4272449016571045, |
| "sampling/sampling_logp_difference/mean": 0.00943131186068058, |
| "step": 16, |
| "step_time": 91.92633632400157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0058416714309714735, |
| "clip_ratio/high_mean": 0.0029208357154857367, |
| "clip_ratio/low_mean": 0.0018002486322075129, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00472108434769325, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9424.0, |
| "completions/max_terminated_length": 9424.0, |
| "completions/mean_length": 7896.0, |
| "completions/mean_terminated_length": 7896.0, |
| "completions/min_length": 2734.0, |
| "completions/min_terminated_length": 2734.0, |
| "entropy": 0.08829498197883368, |
| "epoch": 0.00068, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.409907341003418, |
| "kl": 0.0010910468854490318, |
| "learning_rate": 3.657142857142857e-06, |
| "loss": -0.0325, |
| "num_tokens": 4700950.0, |
| "reward": -0.4402346909046173, |
| "reward_std": 0.4639027416706085, |
| "rewards/rollout_reward_func/mean": -0.4402346909046173, |
| "rewards/rollout_reward_func/std": 0.49393191933631897, |
| "sampling/importance_sampling_ratio/max": 2.308764696121216, |
| "sampling/importance_sampling_ratio/mean": 0.999572217464447, |
| "sampling/importance_sampling_ratio/min": 0.42815110087394714, |
| "sampling/sampling_logp_difference/max": 0.8482791185379028, |
| "sampling/sampling_logp_difference/mean": 0.010221365839242935, |
| "step": 17, |
| "step_time": 94.6907513449978 |
| }, |
| { |
| "clip_ratio/high_max": 0.005707252013962716, |
| "clip_ratio/high_mean": 0.0032087396539282054, |
| "clip_ratio/low_mean": 0.0003676470660138875, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003576386719942093, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9244.0, |
| "completions/max_terminated_length": 9244.0, |
| "completions/mean_length": 8353.9375, |
| "completions/mean_terminated_length": 8353.9375, |
| "completions/min_length": 3917.0, |
| "completions/min_terminated_length": 3917.0, |
| "entropy": 0.09095717361196876, |
| "epoch": 0.00072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5224615335464478, |
| "kl": 0.0014694390793010825, |
| "learning_rate": 3.885714285714286e-06, |
| "loss": -0.0395, |
| "num_tokens": 4995416.0, |
| "reward": -0.4383149743080139, |
| "reward_std": 0.2706039547920227, |
| "rewards/rollout_reward_func/mean": -0.4383149743080139, |
| "rewards/rollout_reward_func/std": 0.32969218492507935, |
| "sampling/importance_sampling_ratio/max": 2.5134615898132324, |
| "sampling/importance_sampling_ratio/mean": 1.0007150173187256, |
| "sampling/importance_sampling_ratio/min": 0.5002727508544922, |
| "sampling/sampling_logp_difference/max": 0.9216609001159668, |
| "sampling/sampling_logp_difference/mean": 0.010718216188251972, |
| "step": 18, |
| "step_time": 97.70541409099951 |
| }, |
| { |
| "clip_ratio/high_max": 0.01007883029524237, |
| "clip_ratio/high_mean": 0.00613380636787042, |
| "clip_ratio/low_mean": 0.002054276497801766, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008188082865672186, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9502.0, |
| "completions/max_terminated_length": 9502.0, |
| "completions/mean_length": 7494.6875, |
| "completions/mean_terminated_length": 7494.6875, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "entropy": 0.1125592899043113, |
| "epoch": 0.00076, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3465840816497803, |
| "kl": 0.001035056316140981, |
| "learning_rate": 4.114285714285714e-06, |
| "loss": -0.3483, |
| "num_tokens": 5262287.0, |
| "reward": -0.5773377418518066, |
| "reward_std": 0.2989773154258728, |
| "rewards/rollout_reward_func/mean": -0.5773377418518066, |
| "rewards/rollout_reward_func/std": 0.3251783847808838, |
| "sampling/importance_sampling_ratio/max": 1.4896283149719238, |
| "sampling/importance_sampling_ratio/mean": 0.998917281627655, |
| "sampling/importance_sampling_ratio/min": 0.4792267382144928, |
| "sampling/sampling_logp_difference/max": 0.7355813980102539, |
| "sampling/sampling_logp_difference/mean": 0.010768642649054527, |
| "step": 19, |
| "step_time": 92.93958369900156 |
| }, |
| { |
| "clip_ratio/high_max": 0.006031187484040856, |
| "clip_ratio/high_mean": 0.0033707073889672756, |
| "clip_ratio/low_mean": 0.002600287349196151, |
| "clip_ratio/low_min": 0.0014124744920991361, |
| "clip_ratio/region_mean": 0.0059709947381634265, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9373.0, |
| "completions/max_terminated_length": 9373.0, |
| "completions/mean_length": 7589.03125, |
| "completions/mean_terminated_length": 7589.03125, |
| "completions/min_length": 1505.0, |
| "completions/min_terminated_length": 1505.0, |
| "entropy": 0.0929885795339942, |
| "epoch": 0.0008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3409026861190796, |
| "kl": 0.0012185092678009823, |
| "learning_rate": 4.342857142857142e-06, |
| "loss": -0.1635, |
| "num_tokens": 5532862.0, |
| "reward": -0.38000091910362244, |
| "reward_std": 0.5382703542709351, |
| "rewards/rollout_reward_func/mean": -0.38000091910362244, |
| "rewards/rollout_reward_func/std": 0.5696430206298828, |
| "sampling/importance_sampling_ratio/max": 1.7224009037017822, |
| "sampling/importance_sampling_ratio/mean": 0.9997408390045166, |
| "sampling/importance_sampling_ratio/min": 0.43135055899620056, |
| "sampling/sampling_logp_difference/max": 0.8408341407775879, |
| "sampling/sampling_logp_difference/mean": 0.010231327265501022, |
| "step": 20, |
| "step_time": 93.35555667800236 |
| }, |
| { |
| "clip_ratio/high_max": 0.00801744224736467, |
| "clip_ratio/high_mean": 0.004008721123682335, |
| "clip_ratio/low_mean": 0.0037498018937185407, |
| "clip_ratio/low_min": 0.0007102272938936949, |
| "clip_ratio/region_mean": 0.007758523075608537, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9290.0, |
| "completions/max_terminated_length": 9290.0, |
| "completions/mean_length": 8283.6875, |
| "completions/mean_terminated_length": 8283.6875, |
| "completions/min_length": 5247.0, |
| "completions/min_terminated_length": 5247.0, |
| "entropy": 0.10184192517772317, |
| "epoch": 0.00084, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.558704137802124, |
| "kl": 0.0017424946527171414, |
| "learning_rate": 4.571428571428571e-06, |
| "loss": -0.0122, |
| "num_tokens": 5825083.0, |
| "reward": -0.4749671220779419, |
| "reward_std": 0.3252595067024231, |
| "rewards/rollout_reward_func/mean": -0.4749671220779419, |
| "rewards/rollout_reward_func/std": 0.3718957304954529, |
| "sampling/importance_sampling_ratio/max": 1.6438056230545044, |
| "sampling/importance_sampling_ratio/mean": 0.9992358684539795, |
| "sampling/importance_sampling_ratio/min": 0.4867466688156128, |
| "sampling/sampling_logp_difference/max": 0.7200114727020264, |
| "sampling/sampling_logp_difference/mean": 0.010186174884438515, |
| "step": 21, |
| "step_time": 97.14219132500148 |
| }, |
| { |
| "clip_ratio/high_max": 0.005766152869910002, |
| "clip_ratio/high_mean": 0.002883076434955001, |
| "clip_ratio/low_mean": 0.0018179319449700415, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004701008350821212, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9405.0, |
| "completions/max_terminated_length": 9405.0, |
| "completions/mean_length": 7702.15625, |
| "completions/mean_terminated_length": 7702.15625, |
| "completions/min_length": 2410.0, |
| "completions/min_terminated_length": 2410.0, |
| "entropy": 0.10855974955484271, |
| "epoch": 0.00088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.825653076171875, |
| "kl": 0.0012982330545128207, |
| "learning_rate": 4.8e-06, |
| "loss": -0.1233, |
| "num_tokens": 6098628.0, |
| "reward": -0.37962836027145386, |
| "reward_std": 0.4688373804092407, |
| "rewards/rollout_reward_func/mean": -0.37962836027145386, |
| "rewards/rollout_reward_func/std": 0.47551068663597107, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0009162425994873, |
| "sampling/importance_sampling_ratio/min": 0.3347569704055786, |
| "sampling/sampling_logp_difference/max": 1.1153497695922852, |
| "sampling/sampling_logp_difference/mean": 0.014124426990747452, |
| "step": 22, |
| "step_time": 93.53759804500078 |
| }, |
| { |
| "clip_ratio/high_max": 0.007469919160939753, |
| "clip_ratio/high_mean": 0.004086083208676428, |
| "clip_ratio/low_mean": 0.004067586531164125, |
| "clip_ratio/low_min": 0.0007183908019214869, |
| "clip_ratio/region_mean": 0.008153669739840552, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9131.0, |
| "completions/max_terminated_length": 9131.0, |
| "completions/mean_length": 7552.71875, |
| "completions/mean_terminated_length": 7552.71875, |
| "completions/min_length": 274.0, |
| "completions/min_terminated_length": 274.0, |
| "entropy": 0.10671317647211254, |
| "epoch": 0.00092, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3214221000671387, |
| "kl": 0.0012135217020841083, |
| "learning_rate": 5.0285714285714285e-06, |
| "loss": -0.0996, |
| "num_tokens": 6367409.0, |
| "reward": -0.3321595788002014, |
| "reward_std": 0.4558962881565094, |
| "rewards/rollout_reward_func/mean": -0.3321595788002014, |
| "rewards/rollout_reward_func/std": 0.47004231810569763, |
| "sampling/importance_sampling_ratio/max": 1.9132328033447266, |
| "sampling/importance_sampling_ratio/mean": 1.0005993843078613, |
| "sampling/importance_sampling_ratio/min": 0.6039242148399353, |
| "sampling/sampling_logp_difference/max": 0.648794412612915, |
| "sampling/sampling_logp_difference/mean": 0.01185811311006546, |
| "step": 23, |
| "step_time": 91.04886297199937 |
| }, |
| { |
| "clip_ratio/high_max": 0.005706682160962373, |
| "clip_ratio/high_mean": 0.0032298470905516297, |
| "clip_ratio/low_mean": 0.004950174450641498, |
| "clip_ratio/low_min": 0.0007183908019214869, |
| "clip_ratio/region_mean": 0.008180021541193128, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9173.0, |
| "completions/max_terminated_length": 9173.0, |
| "completions/mean_length": 7576.0, |
| "completions/mean_terminated_length": 7576.0, |
| "completions/min_length": 889.0, |
| "completions/min_terminated_length": 889.0, |
| "entropy": 0.08463638043031096, |
| "epoch": 0.00096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1648985147476196, |
| "kl": 0.0009409518825123087, |
| "learning_rate": 5.257142857142857e-06, |
| "loss": -0.2373, |
| "num_tokens": 6637311.0, |
| "reward": -0.5275511741638184, |
| "reward_std": 0.3658484220504761, |
| "rewards/rollout_reward_func/mean": -0.5275511741638184, |
| "rewards/rollout_reward_func/std": 0.35727164149284363, |
| "sampling/importance_sampling_ratio/max": 2.0391674041748047, |
| "sampling/importance_sampling_ratio/mean": 0.9986543655395508, |
| "sampling/importance_sampling_ratio/min": 0.4036279618740082, |
| "sampling/sampling_logp_difference/max": 0.9072617292404175, |
| "sampling/sampling_logp_difference/mean": 0.009868521243333817, |
| "step": 24, |
| "step_time": 93.86759965099827 |
| }, |
| { |
| "clip_ratio/high_max": 0.008623368688859046, |
| "clip_ratio/high_mean": 0.00466679799137637, |
| "clip_ratio/low_mean": 0.0014336399617604911, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006100437924033031, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8925.0, |
| "completions/max_terminated_length": 8925.0, |
| "completions/mean_length": 7419.5625, |
| "completions/mean_terminated_length": 7419.5625, |
| "completions/min_length": 395.0, |
| "completions/min_terminated_length": 395.0, |
| "entropy": 0.10826450167223811, |
| "epoch": 0.001, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5118001699447632, |
| "kl": 0.0012769073209710768, |
| "learning_rate": 5.485714285714286e-06, |
| "loss": -0.1327, |
| "num_tokens": 6901616.0, |
| "reward": -0.46326500177383423, |
| "reward_std": 0.30533546209335327, |
| "rewards/rollout_reward_func/mean": -0.46326500177383423, |
| "rewards/rollout_reward_func/std": 0.44885945320129395, |
| "sampling/importance_sampling_ratio/max": 1.874658226966858, |
| "sampling/importance_sampling_ratio/mean": 0.9994730949401855, |
| "sampling/importance_sampling_ratio/min": 0.25686314702033997, |
| "sampling/sampling_logp_difference/max": 1.359211802482605, |
| "sampling/sampling_logp_difference/mean": 0.012570840306580067, |
| "step": 25, |
| "step_time": 90.51520027400147 |
| }, |
| { |
| "clip_ratio/high_max": 0.00718123564729467, |
| "clip_ratio/high_mean": 0.003590617823647335, |
| "clip_ratio/low_mean": 0.003063533455133438, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006654151278780773, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9233.0, |
| "completions/max_terminated_length": 9233.0, |
| "completions/mean_length": 7398.21875, |
| "completions/mean_terminated_length": 7398.21875, |
| "completions/min_length": 432.0, |
| "completions/min_terminated_length": 432.0, |
| "entropy": 0.10546251107007265, |
| "epoch": 0.00104, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.260907769203186, |
| "kl": 0.0011700783816195326, |
| "learning_rate": 5.7142857142857145e-06, |
| "loss": -0.1799, |
| "num_tokens": 7165128.0, |
| "reward": -0.43886110186576843, |
| "reward_std": 0.43460309505462646, |
| "rewards/rollout_reward_func/mean": -0.43886110186576843, |
| "rewards/rollout_reward_func/std": 0.5057812929153442, |
| "sampling/importance_sampling_ratio/max": 1.4404255151748657, |
| "sampling/importance_sampling_ratio/mean": 0.9987990856170654, |
| "sampling/importance_sampling_ratio/min": 0.42815157771110535, |
| "sampling/sampling_logp_difference/max": 0.8482780456542969, |
| "sampling/sampling_logp_difference/mean": 0.010892972350120544, |
| "step": 26, |
| "step_time": 92.59128134799994 |
| }, |
| { |
| "clip_ratio/high_max": 0.0057237689034081995, |
| "clip_ratio/high_mean": 0.0028618844517040998, |
| "clip_ratio/low_mean": 0.004686754720751196, |
| "clip_ratio/low_min": 0.0010416667209938169, |
| "clip_ratio/region_mean": 0.007548639201559126, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9370.0, |
| "completions/max_terminated_length": 9370.0, |
| "completions/mean_length": 8178.8125, |
| "completions/mean_terminated_length": 8178.8125, |
| "completions/min_length": 1876.0, |
| "completions/min_terminated_length": 1876.0, |
| "entropy": 0.08889654604718089, |
| "epoch": 0.00108, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5473191738128662, |
| "kl": 0.0017560607998348132, |
| "learning_rate": 5.942857142857143e-06, |
| "loss": -0.1515, |
| "num_tokens": 7453964.0, |
| "reward": -0.38067591190338135, |
| "reward_std": 0.30481967329978943, |
| "rewards/rollout_reward_func/mean": -0.38067591190338135, |
| "rewards/rollout_reward_func/std": 0.35617271065711975, |
| "sampling/importance_sampling_ratio/max": 1.6653558015823364, |
| "sampling/importance_sampling_ratio/mean": 0.9998430013656616, |
| "sampling/importance_sampling_ratio/min": 0.4598047733306885, |
| "sampling/sampling_logp_difference/max": 0.7769533395767212, |
| "sampling/sampling_logp_difference/mean": 0.009767385199666023, |
| "step": 27, |
| "step_time": 92.73414052499811 |
| }, |
| { |
| "clip_ratio/high_max": 0.006360859319102019, |
| "clip_ratio/high_mean": 0.0031804296595510095, |
| "clip_ratio/low_mean": 0.0023903494293335825, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005570779088884592, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9179.0, |
| "completions/max_terminated_length": 9179.0, |
| "completions/mean_length": 7466.21875, |
| "completions/mean_terminated_length": 7466.21875, |
| "completions/min_length": 873.0, |
| "completions/min_terminated_length": 873.0, |
| "entropy": 0.09552860073745251, |
| "epoch": 0.00112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2436789274215698, |
| "kl": 0.0010375778874731623, |
| "learning_rate": 6.171428571428571e-06, |
| "loss": -0.2215, |
| "num_tokens": 7719394.0, |
| "reward": -0.5598432421684265, |
| "reward_std": 0.38949286937713623, |
| "rewards/rollout_reward_func/mean": -0.5598432421684265, |
| "rewards/rollout_reward_func/std": 0.3775184154510498, |
| "sampling/importance_sampling_ratio/max": 1.9373853206634521, |
| "sampling/importance_sampling_ratio/mean": 1.0011639595031738, |
| "sampling/importance_sampling_ratio/min": 0.48319393396377563, |
| "sampling/sampling_logp_difference/max": 0.7273372411727905, |
| "sampling/sampling_logp_difference/mean": 0.010366151109337807, |
| "step": 28, |
| "step_time": 92.60433979299978 |
| }, |
| { |
| "clip_ratio/high_max": 0.007077141373883933, |
| "clip_ratio/high_mean": 0.0035385706869419664, |
| "clip_ratio/low_mean": 0.0018004352750722319, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005339005962014198, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9004.0, |
| "completions/max_terminated_length": 9004.0, |
| "completions/mean_length": 8326.46875, |
| "completions/mean_terminated_length": 8326.46875, |
| "completions/min_length": 5302.0, |
| "completions/min_terminated_length": 5302.0, |
| "entropy": 0.10070427041500807, |
| "epoch": 0.00116, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5207918882369995, |
| "kl": 0.0014244818194129039, |
| "learning_rate": 6.4e-06, |
| "loss": 0.0112, |
| "num_tokens": 8013061.0, |
| "reward": -0.39570820331573486, |
| "reward_std": 0.3965410888195038, |
| "rewards/rollout_reward_func/mean": -0.39570820331573486, |
| "rewards/rollout_reward_func/std": 0.45599400997161865, |
| "sampling/importance_sampling_ratio/max": 1.5399894714355469, |
| "sampling/importance_sampling_ratio/mean": 0.9994006156921387, |
| "sampling/importance_sampling_ratio/min": 0.6555483937263489, |
| "sampling/sampling_logp_difference/max": 0.4317755699157715, |
| "sampling/sampling_logp_difference/mean": 0.00899563729763031, |
| "step": 29, |
| "step_time": 96.32543996599998 |
| }, |
| { |
| "clip_ratio/high_max": 0.004940582090057433, |
| "clip_ratio/high_mean": 0.0024702910450287163, |
| "clip_ratio/low_mean": 0.0014245363418012857, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003894827386830002, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9184.0, |
| "completions/max_terminated_length": 9184.0, |
| "completions/mean_length": 7472.78125, |
| "completions/mean_terminated_length": 7472.78125, |
| "completions/min_length": 205.0, |
| "completions/min_terminated_length": 205.0, |
| "entropy": 0.0843133123125881, |
| "epoch": 0.0012, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9470192193984985, |
| "kl": 0.0012731589044960856, |
| "learning_rate": 6.628571428571428e-06, |
| "loss": -0.2329, |
| "num_tokens": 8278694.0, |
| "reward": -0.4546031951904297, |
| "reward_std": 0.37902095913887024, |
| "rewards/rollout_reward_func/mean": -0.4546031951904297, |
| "rewards/rollout_reward_func/std": 0.3981786370277405, |
| "sampling/importance_sampling_ratio/max": 2.6150405406951904, |
| "sampling/importance_sampling_ratio/mean": 1.0001695156097412, |
| "sampling/importance_sampling_ratio/min": 0.45047178864479065, |
| "sampling/sampling_logp_difference/max": 0.9612796306610107, |
| "sampling/sampling_logp_difference/mean": 0.009561501443386078, |
| "step": 30, |
| "step_time": 91.3648072499991 |
| }, |
| { |
| "clip_ratio/high_max": 0.007095419394318014, |
| "clip_ratio/high_mean": 0.003894931956892833, |
| "clip_ratio/low_mean": 0.0019353094103280455, |
| "clip_ratio/low_min": 0.000735294132027775, |
| "clip_ratio/region_mean": 0.005830241338117048, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9345.0, |
| "completions/max_terminated_length": 9345.0, |
| "completions/mean_length": 7978.6875, |
| "completions/mean_terminated_length": 7978.6875, |
| "completions/min_length": 1496.0, |
| "completions/min_terminated_length": 1496.0, |
| "entropy": 0.12459231936372817, |
| "epoch": 0.00124, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6051055192947388, |
| "kl": 0.001607223342944053, |
| "learning_rate": 6.857142857142856e-06, |
| "loss": -0.0711, |
| "num_tokens": 8561291.0, |
| "reward": -0.4515855014324188, |
| "reward_std": 0.3598511219024658, |
| "rewards/rollout_reward_func/mean": -0.4515855014324188, |
| "rewards/rollout_reward_func/std": 0.40270760655403137, |
| "sampling/importance_sampling_ratio/max": 1.798833966255188, |
| "sampling/importance_sampling_ratio/mean": 1.0015867948532104, |
| "sampling/importance_sampling_ratio/min": 0.6142684817314148, |
| "sampling/sampling_logp_difference/max": 0.5871386528015137, |
| "sampling/sampling_logp_difference/mean": 0.012068906798958778, |
| "step": 31, |
| "step_time": 94.2069850859998 |
| }, |
| { |
| "clip_ratio/high_max": 0.0036044081789441407, |
| "clip_ratio/high_mean": 0.0018022040894720703, |
| "clip_ratio/low_mean": 0.004805904318345711, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006608108436921611, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9188.0, |
| "completions/max_terminated_length": 9188.0, |
| "completions/mean_length": 7920.375, |
| "completions/mean_terminated_length": 7920.375, |
| "completions/min_length": 1912.0, |
| "completions/min_terminated_length": 1912.0, |
| "entropy": 0.10130815836600959, |
| "epoch": 0.00128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1719034910202026, |
| "kl": 0.0014112608332652599, |
| "learning_rate": 7.085714285714285e-06, |
| "loss": -0.1936, |
| "num_tokens": 8841736.0, |
| "reward": -0.4756479263305664, |
| "reward_std": 0.3400876522064209, |
| "rewards/rollout_reward_func/mean": -0.4756479263305664, |
| "rewards/rollout_reward_func/std": 0.39079946279525757, |
| "sampling/importance_sampling_ratio/max": 1.9659785032272339, |
| "sampling/importance_sampling_ratio/mean": 1.0011847019195557, |
| "sampling/importance_sampling_ratio/min": 0.5454869270324707, |
| "sampling/sampling_logp_difference/max": 0.675990104675293, |
| "sampling/sampling_logp_difference/mean": 0.010147813707590103, |
| "step": 32, |
| "step_time": 94.26557795399913 |
| }, |
| { |
| "clip_ratio/high_max": 0.00641977513441816, |
| "clip_ratio/high_mean": 0.003561011195415631, |
| "clip_ratio/low_mean": 0.002905456320149824, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006466467515565455, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9143.0, |
| "completions/max_terminated_length": 9143.0, |
| "completions/mean_length": 8356.90625, |
| "completions/mean_terminated_length": 8356.90625, |
| "completions/min_length": 7207.0, |
| "completions/min_terminated_length": 7207.0, |
| "entropy": 0.10661770938895643, |
| "epoch": 0.00132, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5199352502822876, |
| "kl": 0.0017544178053867654, |
| "learning_rate": 7.314285714285714e-06, |
| "loss": -0.0041, |
| "num_tokens": 9136084.0, |
| "reward": -0.4854487180709839, |
| "reward_std": 0.3774080276489258, |
| "rewards/rollout_reward_func/mean": -0.4854487180709839, |
| "rewards/rollout_reward_func/std": 0.4189002811908722, |
| "sampling/importance_sampling_ratio/max": 2.0858850479125977, |
| "sampling/importance_sampling_ratio/mean": 1.0001283884048462, |
| "sampling/importance_sampling_ratio/min": 0.5391538143157959, |
| "sampling/sampling_logp_difference/max": 0.7351932525634766, |
| "sampling/sampling_logp_difference/mean": 0.012622365728020668, |
| "step": 33, |
| "step_time": 98.72545758600063 |
| }, |
| { |
| "clip_ratio/high_max": 0.006451850291341543, |
| "clip_ratio/high_mean": 0.0032259251456707716, |
| "clip_ratio/low_mean": 0.0043309712782502174, |
| "clip_ratio/low_min": 0.0007267441833391786, |
| "clip_ratio/region_mean": 0.0075568964530248195, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9138.0, |
| "completions/max_terminated_length": 9138.0, |
| "completions/mean_length": 7346.3125, |
| "completions/mean_terminated_length": 7346.3125, |
| "completions/min_length": 653.0, |
| "completions/min_terminated_length": 653.0, |
| "entropy": 0.09478296199813485, |
| "epoch": 0.00136, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5919150114059448, |
| "kl": 0.0016131439460878028, |
| "learning_rate": 7.542857142857142e-06, |
| "loss": -0.159, |
| "num_tokens": 9397980.0, |
| "reward": -0.44115152955055237, |
| "reward_std": 0.431715190410614, |
| "rewards/rollout_reward_func/mean": -0.44115152955055237, |
| "rewards/rollout_reward_func/std": 0.45346444845199585, |
| "sampling/importance_sampling_ratio/max": 2.316176414489746, |
| "sampling/importance_sampling_ratio/mean": 1.0013811588287354, |
| "sampling/importance_sampling_ratio/min": 0.4872380495071411, |
| "sampling/sampling_logp_difference/max": 0.8399176597595215, |
| "sampling/sampling_logp_difference/mean": 0.010870829224586487, |
| "step": 34, |
| "step_time": 94.50844769800096 |
| }, |
| { |
| "clip_ratio/high_max": 0.00862313958350569, |
| "clip_ratio/high_mean": 0.004311569791752845, |
| "clip_ratio/low_mean": 0.0021354027558118105, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0064469725475646555, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8829.0, |
| "completions/max_terminated_length": 8829.0, |
| "completions/mean_length": 6773.25, |
| "completions/mean_terminated_length": 6773.25, |
| "completions/min_length": 273.0, |
| "completions/min_terminated_length": 273.0, |
| "entropy": 0.09439847921021283, |
| "epoch": 0.0014, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2422112226486206, |
| "kl": 0.0016963164634944405, |
| "learning_rate": 7.771428571428572e-06, |
| "loss": -0.2231, |
| "num_tokens": 9641472.0, |
| "reward": -0.5031548142433167, |
| "reward_std": 0.4589303135871887, |
| "rewards/rollout_reward_func/mean": -0.5031548142433167, |
| "rewards/rollout_reward_func/std": 0.46338412165641785, |
| "sampling/importance_sampling_ratio/max": 1.4338102340698242, |
| "sampling/importance_sampling_ratio/mean": 0.9990464448928833, |
| "sampling/importance_sampling_ratio/min": 0.5403674840927124, |
| "sampling/sampling_logp_difference/max": 0.6155059337615967, |
| "sampling/sampling_logp_difference/mean": 0.00977895874530077, |
| "step": 35, |
| "step_time": 92.36479951699857 |
| }, |
| { |
| "clip_ratio/high_max": 0.007964848773553967, |
| "clip_ratio/high_mean": 0.004350071423687041, |
| "clip_ratio/low_mean": 0.002024611836532131, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006374683260219172, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9254.0, |
| "completions/max_terminated_length": 9254.0, |
| "completions/mean_length": 7325.6875, |
| "completions/mean_terminated_length": 7325.6875, |
| "completions/min_length": 418.0, |
| "completions/min_terminated_length": 418.0, |
| "entropy": 0.10633090999908745, |
| "epoch": 0.00144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1656460762023926, |
| "kl": 0.0023267651195055805, |
| "learning_rate": 8e-06, |
| "loss": -0.275, |
| "num_tokens": 9902557.0, |
| "reward": -0.4935969114303589, |
| "reward_std": 0.3179719150066376, |
| "rewards/rollout_reward_func/mean": -0.4935969114303589, |
| "rewards/rollout_reward_func/std": 0.473392516374588, |
| "sampling/importance_sampling_ratio/max": 1.6787704229354858, |
| "sampling/importance_sampling_ratio/mean": 1.0006248950958252, |
| "sampling/importance_sampling_ratio/min": 0.6142685413360596, |
| "sampling/sampling_logp_difference/max": 0.518061637878418, |
| "sampling/sampling_logp_difference/mean": 0.011818873696029186, |
| "step": 36, |
| "step_time": 88.57236199499948 |
| }, |
| { |
| "clip_ratio/high_max": 0.006417479307856411, |
| "clip_ratio/high_mean": 0.0032087396539282054, |
| "clip_ratio/low_mean": 0.0007312192174140364, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003939958871342242, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9355.0, |
| "completions/max_terminated_length": 9355.0, |
| "completions/mean_length": 7663.3125, |
| "completions/mean_terminated_length": 7663.3125, |
| "completions/min_length": 847.0, |
| "completions/min_terminated_length": 847.0, |
| "entropy": 0.08696980169042945, |
| "epoch": 0.00148, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.192537546157837, |
| "kl": 0.002410084724033368, |
| "learning_rate": 7.999888877348797e-06, |
| "loss": -0.2529, |
| "num_tokens": 10174044.0, |
| "reward": -0.5070594549179077, |
| "reward_std": 0.3443080186843872, |
| "rewards/rollout_reward_func/mean": -0.5070594549179077, |
| "rewards/rollout_reward_func/std": 0.4040925204753876, |
| "sampling/importance_sampling_ratio/max": 1.9132274389266968, |
| "sampling/importance_sampling_ratio/mean": 1.0009825229644775, |
| "sampling/importance_sampling_ratio/min": 0.598889946937561, |
| "sampling/sampling_logp_difference/max": 0.6487915515899658, |
| "sampling/sampling_logp_difference/mean": 0.009411752223968506, |
| "step": 37, |
| "step_time": 93.29507449899847 |
| }, |
| { |
| "clip_ratio/high_max": 0.002155932132154703, |
| "clip_ratio/high_mean": 0.001437161467038095, |
| "clip_ratio/low_mean": 0.0017123925790656358, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003149554046103731, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9064.0, |
| "completions/max_terminated_length": 9064.0, |
| "completions/mean_length": 7980.78125, |
| "completions/mean_terminated_length": 7980.78125, |
| "completions/min_length": 423.0, |
| "completions/min_terminated_length": 423.0, |
| "entropy": 0.09132296103052795, |
| "epoch": 0.00152, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5600956678390503, |
| "kl": 0.0021186837166169425, |
| "learning_rate": 7.999555517627349e-06, |
| "loss": -0.0928, |
| "num_tokens": 10455930.0, |
| "reward": -0.37076419591903687, |
| "reward_std": 0.42336732149124146, |
| "rewards/rollout_reward_func/mean": -0.37076419591903687, |
| "rewards/rollout_reward_func/std": 0.44395267963409424, |
| "sampling/importance_sampling_ratio/max": 1.7510610818862915, |
| "sampling/importance_sampling_ratio/mean": 0.999518871307373, |
| "sampling/importance_sampling_ratio/min": 0.5154094696044922, |
| "sampling/sampling_logp_difference/max": 0.6627936363220215, |
| "sampling/sampling_logp_difference/mean": 0.00994320772588253, |
| "step": 38, |
| "step_time": 94.34694778699941 |
| }, |
| { |
| "clip_ratio/high_max": 0.0057840069639496505, |
| "clip_ratio/high_mean": 0.0032431271101813763, |
| "clip_ratio/low_mean": 0.002503328782040626, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005746455892222002, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9161.0, |
| "completions/max_terminated_length": 9161.0, |
| "completions/mean_length": 7555.75, |
| "completions/mean_terminated_length": 7555.75, |
| "completions/min_length": 1935.0, |
| "completions/min_terminated_length": 1935.0, |
| "entropy": 0.10114077711477876, |
| "epoch": 0.00156, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3555232286453247, |
| "kl": 0.004268854574547731, |
| "learning_rate": 7.998999945531534e-06, |
| "loss": -0.0645, |
| "num_tokens": 10724635.0, |
| "reward": -0.35600388050079346, |
| "reward_std": 0.4358783960342407, |
| "rewards/rollout_reward_func/mean": -0.35600388050079346, |
| "rewards/rollout_reward_func/std": 0.5173202753067017, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.0001956224441528, |
| "sampling/importance_sampling_ratio/min": 0.5086521506309509, |
| "sampling/sampling_logp_difference/max": 1.204916000366211, |
| "sampling/sampling_logp_difference/mean": 0.012541755102574825, |
| "step": 39, |
| "step_time": 92.03758510300031 |
| }, |
| { |
| "clip_ratio/high_max": 0.006485855847131461, |
| "clip_ratio/high_mean": 0.003962173970649019, |
| "clip_ratio/low_mean": 0.0059998030483257025, |
| "clip_ratio/low_min": 0.0007183908019214869, |
| "clip_ratio/region_mean": 0.009961976989870891, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9313.0, |
| "completions/max_terminated_length": 9313.0, |
| "completions/mean_length": 8137.46875, |
| "completions/mean_terminated_length": 8137.46875, |
| "completions/min_length": 1552.0, |
| "completions/min_terminated_length": 1552.0, |
| "entropy": 0.10354270855896175, |
| "epoch": 0.0016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2437281608581543, |
| "kl": 0.004692138581958716, |
| "learning_rate": 7.998222202219114e-06, |
| "loss": -0.1094, |
| "num_tokens": 11011586.0, |
| "reward": -0.476349800825119, |
| "reward_std": 0.2979031801223755, |
| "rewards/rollout_reward_func/mean": -0.476349800825119, |
| "rewards/rollout_reward_func/std": 0.29430100321769714, |
| "sampling/importance_sampling_ratio/max": 1.9385071992874146, |
| "sampling/importance_sampling_ratio/mean": 1.000066876411438, |
| "sampling/importance_sampling_ratio/min": 0.3966410756111145, |
| "sampling/sampling_logp_difference/max": 0.9247235059738159, |
| "sampling/sampling_logp_difference/mean": 0.011882856488227844, |
| "step": 40, |
| "step_time": 95.45463361300062 |
| }, |
| { |
| "clip_ratio/high_max": 0.005031014792621136, |
| "clip_ratio/high_mean": 0.002874702855478972, |
| "clip_ratio/low_mean": 0.0025365093897562474, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005411212187027559, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9126.0, |
| "completions/max_terminated_length": 9126.0, |
| "completions/mean_length": 7551.875, |
| "completions/mean_terminated_length": 7551.875, |
| "completions/min_length": 273.0, |
| "completions/min_terminated_length": 273.0, |
| "entropy": 0.10094616864807904, |
| "epoch": 0.00164, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2249935865402222, |
| "kl": 0.004580019838613225, |
| "learning_rate": 7.99722234530669e-06, |
| "loss": -0.1866, |
| "num_tokens": 11279946.0, |
| "reward": -0.509591817855835, |
| "reward_std": 0.4151650369167328, |
| "rewards/rollout_reward_func/mean": -0.509591817855835, |
| "rewards/rollout_reward_func/std": 0.47652074694633484, |
| "sampling/importance_sampling_ratio/max": 1.552679181098938, |
| "sampling/importance_sampling_ratio/mean": 0.9984216690063477, |
| "sampling/importance_sampling_ratio/min": 0.41414451599121094, |
| "sampling/sampling_logp_difference/max": 0.8815402984619141, |
| "sampling/sampling_logp_difference/mean": 0.009862950071692467, |
| "step": 41, |
| "step_time": 92.74472697800138 |
| }, |
| { |
| "clip_ratio/high_max": 0.00608739914605394, |
| "clip_ratio/high_mean": 0.0034028949739877135, |
| "clip_ratio/low_mean": 0.0022560179349966347, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005658912938088179, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9129.0, |
| "completions/max_terminated_length": 9129.0, |
| "completions/mean_length": 7584.25, |
| "completions/mean_terminated_length": 7584.25, |
| "completions/min_length": 272.0, |
| "completions/min_terminated_length": 272.0, |
| "entropy": 0.1042005839990452, |
| "epoch": 0.00168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3942543268203735, |
| "kl": 0.004878379874753591, |
| "learning_rate": 7.996000448865428e-06, |
| "loss": -0.1046, |
| "num_tokens": 11549745.0, |
| "reward": -0.33219966292381287, |
| "reward_std": 0.51850426197052, |
| "rewards/rollout_reward_func/mean": -0.33219966292381287, |
| "rewards/rollout_reward_func/std": 0.5476008653640747, |
| "sampling/importance_sampling_ratio/max": 1.732586145401001, |
| "sampling/importance_sampling_ratio/mean": 0.999406099319458, |
| "sampling/importance_sampling_ratio/min": 0.5086521506309509, |
| "sampling/sampling_logp_difference/max": 0.6759908199310303, |
| "sampling/sampling_logp_difference/mean": 0.011652151122689247, |
| "step": 42, |
| "step_time": 92.72314244600057 |
| }, |
| { |
| "clip_ratio/high_max": 0.0075053395121358335, |
| "clip_ratio/high_mean": 0.0037526697560679168, |
| "clip_ratio/low_mean": 0.003480347600998357, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007233017327962443, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9100.0, |
| "completions/max_terminated_length": 9100.0, |
| "completions/mean_length": 7485.375, |
| "completions/mean_terminated_length": 7485.375, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "entropy": 0.09614216513000429, |
| "epoch": 0.00172, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0772511959075928, |
| "kl": 0.00416894428053638, |
| "learning_rate": 7.994556603415576e-06, |
| "loss": -0.1628, |
| "num_tokens": 11816091.0, |
| "reward": -0.42178797721862793, |
| "reward_std": 0.4303983449935913, |
| "rewards/rollout_reward_func/mean": -0.42178797721862793, |
| "rewards/rollout_reward_func/std": 0.4538474678993225, |
| "sampling/importance_sampling_ratio/max": 1.6209644079208374, |
| "sampling/importance_sampling_ratio/mean": 1.0004045963287354, |
| "sampling/importance_sampling_ratio/min": 0.3242146670818329, |
| "sampling/sampling_logp_difference/max": 1.1263494491577148, |
| "sampling/sampling_logp_difference/mean": 0.011008251458406448, |
| "step": 43, |
| "step_time": 95.21691550700052 |
| }, |
| { |
| "clip_ratio/high_max": 0.004524656978901476, |
| "clip_ratio/high_mean": 0.0026095507491845638, |
| "clip_ratio/low_mean": 0.002854076214134693, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0054636269342154264, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9289.0, |
| "completions/max_terminated_length": 9289.0, |
| "completions/mean_length": 7716.84375, |
| "completions/mean_terminated_length": 7716.84375, |
| "completions/min_length": 910.0, |
| "completions/min_terminated_length": 910.0, |
| "entropy": 0.10308593721129, |
| "epoch": 0.00176, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.0850143432617188, |
| "kl": 0.008197117564122891, |
| "learning_rate": 7.992890915919757e-06, |
| "loss": -0.0659, |
| "num_tokens": 12089680.0, |
| "reward": -0.4302936792373657, |
| "reward_std": 0.48004305362701416, |
| "rewards/rollout_reward_func/mean": -0.4302936792373657, |
| "rewards/rollout_reward_func/std": 0.5400975346565247, |
| "sampling/importance_sampling_ratio/max": 2.6225600242614746, |
| "sampling/importance_sampling_ratio/mean": 0.9998188018798828, |
| "sampling/importance_sampling_ratio/min": 0.31274011731147766, |
| "sampling/sampling_logp_difference/max": 1.16238272190094, |
| "sampling/sampling_logp_difference/mean": 0.012810271233320236, |
| "step": 44, |
| "step_time": 93.50324885599821 |
| }, |
| { |
| "clip_ratio/high_max": 0.009332069545052946, |
| "clip_ratio/high_mean": 0.005380530434194952, |
| "clip_ratio/low_mean": 0.00249813572736457, |
| "clip_ratio/low_min": 0.0007102272938936949, |
| "clip_ratio/region_mean": 0.007878666161559522, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9215.0, |
| "completions/max_terminated_length": 9215.0, |
| "completions/mean_length": 7574.625, |
| "completions/mean_terminated_length": 7574.625, |
| "completions/min_length": 271.0, |
| "completions/min_terminated_length": 271.0, |
| "entropy": 0.10209884284995496, |
| "epoch": 0.0018, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.7118793725967407, |
| "kl": 0.012286211093851307, |
| "learning_rate": 7.991003509775045e-06, |
| "loss": -0.2223, |
| "num_tokens": 12358674.0, |
| "reward": -0.4782068431377411, |
| "reward_std": 0.3393690288066864, |
| "rewards/rollout_reward_func/mean": -0.4782068431377411, |
| "rewards/rollout_reward_func/std": 0.3350028097629547, |
| "sampling/importance_sampling_ratio/max": 2.010491371154785, |
| "sampling/importance_sampling_ratio/mean": 0.998539924621582, |
| "sampling/importance_sampling_ratio/min": 0.4432297348976135, |
| "sampling/sampling_logp_difference/max": 0.8136670589447021, |
| "sampling/sampling_logp_difference/mean": 0.013688696548342705, |
| "step": 45, |
| "step_time": 94.50348583800042 |
| }, |
| { |
| "clip_ratio/high_max": 0.006435320246964693, |
| "clip_ratio/high_mean": 0.0032176601234823465, |
| "clip_ratio/low_mean": 0.0035371377016417682, |
| "clip_ratio/low_min": 0.0014286180958151817, |
| "clip_ratio/region_mean": 0.006754797854227945, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8981.0, |
| "completions/max_terminated_length": 8981.0, |
| "completions/mean_length": 7618.03125, |
| "completions/mean_terminated_length": 7618.03125, |
| "completions/min_length": 296.0, |
| "completions/min_terminated_length": 296.0, |
| "entropy": 0.12036720756441355, |
| "epoch": 0.00184, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.262148141860962, |
| "kl": 0.009586318075889722, |
| "learning_rate": 7.988894524803824e-06, |
| "loss": -0.1508, |
| "num_tokens": 12629063.0, |
| "reward": -0.47280675172805786, |
| "reward_std": 0.33904793858528137, |
| "rewards/rollout_reward_func/mean": -0.47280675172805786, |
| "rewards/rollout_reward_func/std": 0.44742149114608765, |
| "sampling/importance_sampling_ratio/max": 1.7463794946670532, |
| "sampling/importance_sampling_ratio/mean": 0.9998955726623535, |
| "sampling/importance_sampling_ratio/min": 0.4740566611289978, |
| "sampling/sampling_logp_difference/max": 0.7464284896850586, |
| "sampling/sampling_logp_difference/mean": 0.013707821257412434, |
| "step": 46, |
| "step_time": 91.94492842599993 |
| }, |
| { |
| "clip_ratio/high_max": 0.010757386451587081, |
| "clip_ratio/high_mean": 0.006088920519687235, |
| "clip_ratio/low_mean": 0.0021593490964733064, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008248269616160542, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9378.0, |
| "completions/max_terminated_length": 9378.0, |
| "completions/mean_length": 7946.34375, |
| "completions/mean_terminated_length": 7946.34375, |
| "completions/min_length": 882.0, |
| "completions/min_terminated_length": 882.0, |
| "entropy": 0.10261929128319025, |
| "epoch": 0.00188, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4796141386032104, |
| "kl": 0.007273106846696464, |
| "learning_rate": 7.986564117243426e-06, |
| "loss": -0.1533, |
| "num_tokens": 12909851.0, |
| "reward": -0.4102572202682495, |
| "reward_std": 0.2766183316707611, |
| "rewards/rollout_reward_func/mean": -0.4102572202682495, |
| "rewards/rollout_reward_func/std": 0.3281300961971283, |
| "sampling/importance_sampling_ratio/max": 2.5544402599334717, |
| "sampling/importance_sampling_ratio/mean": 0.9995360374450684, |
| "sampling/importance_sampling_ratio/min": 0.5378825068473816, |
| "sampling/sampling_logp_difference/max": 0.9378330707550049, |
| "sampling/sampling_logp_difference/mean": 0.011439410038292408, |
| "step": 47, |
| "step_time": 96.26667236799949 |
| }, |
| { |
| "clip_ratio/high_max": 0.007107149169314653, |
| "clip_ratio/high_mean": 0.003900796815287322, |
| "clip_ratio/low_mean": 0.0016345784824807197, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005535375326871872, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9092.0, |
| "completions/max_terminated_length": 9092.0, |
| "completions/mean_length": 7913.40625, |
| "completions/mean_terminated_length": 7913.40625, |
| "completions/min_length": 2022.0, |
| "completions/min_terminated_length": 2022.0, |
| "entropy": 0.1097751297056675, |
| "epoch": 0.00192, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.754341721534729, |
| "kl": 0.009378650134749478, |
| "learning_rate": 7.984012459734564e-06, |
| "loss": 0.0673, |
| "num_tokens": 13190202.0, |
| "reward": -0.3472431004047394, |
| "reward_std": 0.5651601552963257, |
| "rewards/rollout_reward_func/mean": -0.3472431004047394, |
| "rewards/rollout_reward_func/std": 0.5896835327148438, |
| "sampling/importance_sampling_ratio/max": 2.230642080307007, |
| "sampling/importance_sampling_ratio/mean": 1.0000313520431519, |
| "sampling/importance_sampling_ratio/min": 0.41313982009887695, |
| "sampling/sampling_logp_difference/max": 0.8839691877365112, |
| "sampling/sampling_logp_difference/mean": 0.011971874162554741, |
| "step": 48, |
| "step_time": 93.4559626040018 |
| }, |
| { |
| "clip_ratio/high_max": 0.006600149557925761, |
| "clip_ratio/high_mean": 0.004353445663582534, |
| "clip_ratio/low_mean": 0.0038080242229625583, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0081614697992336, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9282.0, |
| "completions/max_terminated_length": 9282.0, |
| "completions/mean_length": 7946.375, |
| "completions/mean_terminated_length": 7946.375, |
| "completions/min_length": 2384.0, |
| "completions/min_terminated_length": 2384.0, |
| "entropy": 0.09944085357710719, |
| "epoch": 0.00196, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.2742760181427, |
| "kl": 0.026823249965673313, |
| "learning_rate": 7.981239741308533e-06, |
| "loss": -0.1137, |
| "num_tokens": 13470689.0, |
| "reward": -0.4703686833381653, |
| "reward_std": 0.2635863721370697, |
| "rewards/rollout_reward_func/mean": -0.4703686833381653, |
| "rewards/rollout_reward_func/std": 0.3776912987232208, |
| "sampling/importance_sampling_ratio/max": 1.6952903270721436, |
| "sampling/importance_sampling_ratio/mean": 0.9991170167922974, |
| "sampling/importance_sampling_ratio/min": 0.13879217207431793, |
| "sampling/sampling_logp_difference/max": 1.9747775793075562, |
| "sampling/sampling_logp_difference/mean": 0.012732356786727905, |
| "step": 49, |
| "step_time": 99.44212867799979 |
| }, |
| { |
| "clip_ratio/high_max": 0.009582321741618216, |
| "clip_ratio/high_mean": 0.0051462745177559555, |
| "clip_ratio/low_mean": 0.0014256636786740273, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006571938167326152, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9426.0, |
| "completions/max_terminated_length": 9426.0, |
| "completions/mean_length": 8105.5625, |
| "completions/mean_terminated_length": 8105.5625, |
| "completions/min_length": 5043.0, |
| "completions/min_terminated_length": 5043.0, |
| "entropy": 0.10958141228184104, |
| "epoch": 0.002, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5589848756790161, |
| "kl": 0.010070401827761088, |
| "learning_rate": 7.97824616737322e-06, |
| "loss": 0.0316, |
| "num_tokens": 13757071.0, |
| "reward": -0.37979596853256226, |
| "reward_std": 0.40848225355148315, |
| "rewards/rollout_reward_func/mean": -0.37979596853256226, |
| "rewards/rollout_reward_func/std": 0.4637466371059418, |
| "sampling/importance_sampling_ratio/max": 1.9868782758712769, |
| "sampling/importance_sampling_ratio/mean": 0.9999864101409912, |
| "sampling/importance_sampling_ratio/min": 0.5031964778900146, |
| "sampling/sampling_logp_difference/max": 0.6867746114730835, |
| "sampling/sampling_logp_difference/mean": 0.012691027484834194, |
| "step": 50, |
| "step_time": 97.8083335750016 |
| }, |
| { |
| "clip_ratio/high_max": 0.003072801686357707, |
| "clip_ratio/high_mean": 0.0015364008431788534, |
| "clip_ratio/low_mean": 0.005726226721890271, |
| "clip_ratio/low_min": 0.0007102272938936949, |
| "clip_ratio/region_mean": 0.007262627565069124, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9171.0, |
| "completions/max_terminated_length": 9171.0, |
| "completions/mean_length": 6720.09375, |
| "completions/mean_terminated_length": 6720.09375, |
| "completions/min_length": 279.0, |
| "completions/min_terminated_length": 279.0, |
| "entropy": 0.13046934758313, |
| "epoch": 0.00204, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.509793758392334, |
| "kl": 0.007366802208707668, |
| "learning_rate": 7.975031959697869e-06, |
| "loss": -0.3095, |
| "num_tokens": 13998648.0, |
| "reward": -0.5631532073020935, |
| "reward_std": 0.45109108090400696, |
| "rewards/rollout_reward_func/mean": -0.5631532073020935, |
| "rewards/rollout_reward_func/std": 0.5162220001220703, |
| "sampling/importance_sampling_ratio/max": 2.486898422241211, |
| "sampling/importance_sampling_ratio/mean": 1.0022380352020264, |
| "sampling/importance_sampling_ratio/min": 0.483195036649704, |
| "sampling/sampling_logp_difference/max": 0.9110362529754639, |
| "sampling/sampling_logp_difference/mean": 0.013770891353487968, |
| "step": 51, |
| "step_time": 90.76845354700072 |
| }, |
| { |
| "clip_ratio/high_max": 0.006709904468152672, |
| "clip_ratio/high_mean": 0.003354952234076336, |
| "clip_ratio/low_mean": 0.002561694651376456, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005916646885452792, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9172.0, |
| "completions/max_terminated_length": 9172.0, |
| "completions/mean_length": 6978.65625, |
| "completions/mean_terminated_length": 6978.65625, |
| "completions/min_length": 399.0, |
| "completions/min_terminated_length": 399.0, |
| "entropy": 0.09509467124007642, |
| "epoch": 0.00208, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.6503801345825195, |
| "kl": 0.009442919392313343, |
| "learning_rate": 7.971597356396667e-06, |
| "loss": -0.1007, |
| "num_tokens": 14248771.0, |
| "reward": -0.2623698115348816, |
| "reward_std": 0.4206399917602539, |
| "rewards/rollout_reward_func/mean": -0.2623698115348816, |
| "rewards/rollout_reward_func/std": 0.599825382232666, |
| "sampling/importance_sampling_ratio/max": 2.463897228240967, |
| "sampling/importance_sampling_ratio/mean": 1.0013692378997803, |
| "sampling/importance_sampling_ratio/min": 0.36485153436660767, |
| "sampling/sampling_logp_difference/max": 1.0082647800445557, |
| "sampling/sampling_logp_difference/mean": 0.012561045587062836, |
| "step": 52, |
| "step_time": 88.74248152199925 |
| }, |
| { |
| "clip_ratio/high_max": 0.006392595882061869, |
| "clip_ratio/high_mean": 0.0031962979410309345, |
| "clip_ratio/low_mean": 0.00391515699448064, |
| "clip_ratio/low_min": 0.0014124744920991361, |
| "clip_ratio/region_mean": 0.007111454935511574, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9027.0, |
| "completions/max_terminated_length": 9027.0, |
| "completions/mean_length": 7856.53125, |
| "completions/mean_terminated_length": 7856.53125, |
| "completions/min_length": 1737.0, |
| "completions/min_terminated_length": 1737.0, |
| "entropy": 0.12024440453387797, |
| "epoch": 0.00212, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 2.004279136657715, |
| "kl": 0.010564027619693661, |
| "learning_rate": 7.967942611911098e-06, |
| "loss": 0.0652, |
| "num_tokens": 14527107.0, |
| "reward": -0.3790961503982544, |
| "reward_std": 0.3861127495765686, |
| "rewards/rollout_reward_func/mean": -0.3790961503982544, |
| "rewards/rollout_reward_func/std": 0.5224611759185791, |
| "sampling/importance_sampling_ratio/max": 2.314476251602173, |
| "sampling/importance_sampling_ratio/mean": 0.9991632699966431, |
| "sampling/importance_sampling_ratio/min": 0.4916602075099945, |
| "sampling/sampling_logp_difference/max": 0.8391833305358887, |
| "sampling/sampling_logp_difference/mean": 0.014314696192741394, |
| "step": 53, |
| "step_time": 94.72645873500005 |
| }, |
| { |
| "clip_ratio/high_max": 0.0043270515743643045, |
| "clip_ratio/high_mean": 0.0027531484374776483, |
| "clip_ratio/low_mean": 0.005383253679610789, |
| "clip_ratio/low_min": 0.0014124744920991361, |
| "clip_ratio/region_mean": 0.008136402087984607, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9499.0, |
| "completions/max_terminated_length": 9499.0, |
| "completions/mean_length": 7953.84375, |
| "completions/mean_terminated_length": 7953.84375, |
| "completions/min_length": 2228.0, |
| "completions/min_terminated_length": 2228.0, |
| "entropy": 0.13417695788666606, |
| "epoch": 0.00216, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5954021215438843, |
| "kl": 0.011198288608284201, |
| "learning_rate": 7.964067996991091e-06, |
| "loss": 0.0528, |
| "num_tokens": 14808243.0, |
| "reward": -0.40133580565452576, |
| "reward_std": 0.49466052651405334, |
| "rewards/rollout_reward_func/mean": -0.40133580565452576, |
| "rewards/rollout_reward_func/std": 0.5125959515571594, |
| "sampling/importance_sampling_ratio/max": 1.7431554794311523, |
| "sampling/importance_sampling_ratio/mean": 1.0006169080734253, |
| "sampling/importance_sampling_ratio/min": 0.5706778764724731, |
| "sampling/sampling_logp_difference/max": 0.5609303712844849, |
| "sampling/sampling_logp_difference/mean": 0.014247460290789604, |
| "step": 54, |
| "step_time": 94.98353923000104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0049565358203835785, |
| "clip_ratio/high_mean": 0.003176613769028336, |
| "clip_ratio/low_mean": 0.0034777895780280232, |
| "clip_ratio/low_min": 0.0007183908019214869, |
| "clip_ratio/region_mean": 0.006654403347056359, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9193.0, |
| "completions/max_terminated_length": 9193.0, |
| "completions/mean_length": 7506.28125, |
| "completions/mean_terminated_length": 7506.28125, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "entropy": 0.10779642057605088, |
| "epoch": 0.0022, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.364328384399414, |
| "kl": 0.00858347719622543, |
| "learning_rate": 7.95997379867497e-06, |
| "loss": -0.3026, |
| "num_tokens": 15075076.0, |
| "reward": -0.5677193403244019, |
| "reward_std": 0.3086436986923218, |
| "rewards/rollout_reward_func/mean": -0.5677193403244019, |
| "rewards/rollout_reward_func/std": 0.30036547780036926, |
| "sampling/importance_sampling_ratio/max": 1.7251583337783813, |
| "sampling/importance_sampling_ratio/mean": 0.9997882843017578, |
| "sampling/importance_sampling_ratio/min": 0.411167174577713, |
| "sampling/sampling_logp_difference/max": 0.8887554407119751, |
| "sampling/sampling_logp_difference/mean": 0.012287753634154797, |
| "step": 55, |
| "step_time": 92.83784514500621 |
| }, |
| { |
| "clip_ratio/high_max": 0.007216088590212166, |
| "clip_ratio/high_mean": 0.003608044295106083, |
| "clip_ratio/low_mean": 0.0014666151255369186, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005074659449746832, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8876.0, |
| "completions/max_terminated_length": 8876.0, |
| "completions/mean_length": 7641.3125, |
| "completions/mean_terminated_length": 7641.3125, |
| "completions/min_length": 1432.0, |
| "completions/min_terminated_length": 1432.0, |
| "entropy": 0.10442002071067691, |
| "epoch": 0.00224, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2740049362182617, |
| "kl": 0.008950471974458196, |
| "learning_rate": 7.955660320268182e-06, |
| "loss": -0.1444, |
| "num_tokens": 15346367.0, |
| "reward": -0.4558144807815552, |
| "reward_std": 0.3483259677886963, |
| "rewards/rollout_reward_func/mean": -0.4558144807815552, |
| "rewards/rollout_reward_func/std": 0.4156731367111206, |
| "sampling/importance_sampling_ratio/max": 1.416229248046875, |
| "sampling/importance_sampling_ratio/mean": 0.999237060546875, |
| "sampling/importance_sampling_ratio/min": 0.46688735485076904, |
| "sampling/sampling_logp_difference/max": 0.7616672515869141, |
| "sampling/sampling_logp_difference/mean": 0.011178133077919483, |
| "step": 56, |
| "step_time": 94.99530417499773 |
| }, |
| { |
| "clip_ratio/high_max": 0.008344250149093568, |
| "clip_ratio/high_mean": 0.005249992886092514, |
| "clip_ratio/low_mean": 0.0018445987370796502, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007094591623172164, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9259.0, |
| "completions/max_terminated_length": 9259.0, |
| "completions/mean_length": 8172.09375, |
| "completions/mean_terminated_length": 8172.09375, |
| "completions/min_length": 1553.0, |
| "completions/min_terminated_length": 1553.0, |
| "entropy": 0.10022557340562344, |
| "epoch": 0.00228, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4207067489624023, |
| "kl": 0.008608459174865857, |
| "learning_rate": 7.951127881320829e-06, |
| "loss": 0.0944, |
| "num_tokens": 15634481.0, |
| "reward": -0.40433624386787415, |
| "reward_std": 0.40806153416633606, |
| "rewards/rollout_reward_func/mean": -0.40433624386787415, |
| "rewards/rollout_reward_func/std": 0.5100822448730469, |
| "sampling/importance_sampling_ratio/max": 2.1468729972839355, |
| "sampling/importance_sampling_ratio/mean": 0.9993807077407837, |
| "sampling/importance_sampling_ratio/min": 0.48448964953422546, |
| "sampling/sampling_logp_difference/max": 0.764012336730957, |
| "sampling/sampling_logp_difference/mean": 0.011580238118767738, |
| "step": 57, |
| "step_time": 95.68224068699783 |
| }, |
| { |
| "clip_ratio/high_max": 0.005006112158298492, |
| "clip_ratio/high_mean": 0.002503056079149246, |
| "clip_ratio/low_mean": 0.0023096000077202916, |
| "clip_ratio/low_min": 0.0007022471982054412, |
| "clip_ratio/region_mean": 0.004812656086869538, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9625.0, |
| "completions/max_terminated_length": 9625.0, |
| "completions/mean_length": 7705.75, |
| "completions/mean_terminated_length": 7705.75, |
| "completions/min_length": 822.0, |
| "completions/min_terminated_length": 822.0, |
| "entropy": 0.10624475823715329, |
| "epoch": 0.00232, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3904070854187012, |
| "kl": 0.013951810746220872, |
| "learning_rate": 7.946376817604e-06, |
| "loss": -0.0914, |
| "num_tokens": 15907912.0, |
| "reward": -0.5049029588699341, |
| "reward_std": 0.3845633864402771, |
| "rewards/rollout_reward_func/mean": -0.5049029588699341, |
| "rewards/rollout_reward_func/std": 0.46199285984039307, |
| "sampling/importance_sampling_ratio/max": 1.798954963684082, |
| "sampling/importance_sampling_ratio/mean": 1.0002611875534058, |
| "sampling/importance_sampling_ratio/min": 0.46839648485183716, |
| "sampling/sampling_logp_difference/max": 0.7584401965141296, |
| "sampling/sampling_logp_difference/mean": 0.0127301886677742, |
| "step": 58, |
| "step_time": 93.91949066700363 |
| }, |
| { |
| "clip_ratio/high_max": 0.00436182162957266, |
| "clip_ratio/high_mean": 0.00218091081478633, |
| "clip_ratio/low_mean": 0.003825479419901967, |
| "clip_ratio/low_min": 0.0014204545877873898, |
| "clip_ratio/region_mean": 0.006006390263792127, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9536.0, |
| "completions/max_terminated_length": 9536.0, |
| "completions/mean_length": 8050.53125, |
| "completions/mean_terminated_length": 8050.53125, |
| "completions/min_length": 894.0, |
| "completions/min_terminated_length": 894.0, |
| "entropy": 0.11231644381769001, |
| "epoch": 0.00236, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3271838426589966, |
| "kl": 0.007607599673065124, |
| "learning_rate": 7.941407481084896e-06, |
| "loss": -0.1529, |
| "num_tokens": 16191978.0, |
| "reward": -0.5152639150619507, |
| "reward_std": 0.3767213821411133, |
| "rewards/rollout_reward_func/mean": -0.5152639150619507, |
| "rewards/rollout_reward_func/std": 0.4134299159049988, |
| "sampling/importance_sampling_ratio/max": 1.7963619232177734, |
| "sampling/importance_sampling_ratio/mean": 0.9979240894317627, |
| "sampling/importance_sampling_ratio/min": 0.4781153202056885, |
| "sampling/sampling_logp_difference/max": 0.737903356552124, |
| "sampling/sampling_logp_difference/mean": 0.012326521798968315, |
| "step": 59, |
| "step_time": 96.3539147010033 |
| }, |
| { |
| "clip_ratio/high_max": 0.008266003103926778, |
| "clip_ratio/high_mean": 0.004488115198910236, |
| "clip_ratio/low_mean": 0.004311547527322546, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.008799662697128952, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9246.0, |
| "completions/max_terminated_length": 9246.0, |
| "completions/mean_length": 8110.9375, |
| "completions/mean_terminated_length": 8110.9375, |
| "completions/min_length": 3902.0, |
| "completions/min_terminated_length": 3902.0, |
| "entropy": 0.12428534729406238, |
| "epoch": 0.0024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.9092047214508057, |
| "kl": 0.012073749028786551, |
| "learning_rate": 7.936220239900746e-06, |
| "loss": -0.0357, |
| "num_tokens": 16478068.0, |
| "reward": -0.4510955512523651, |
| "reward_std": 0.31745630502700806, |
| "rewards/rollout_reward_func/mean": -0.4510955512523651, |
| "rewards/rollout_reward_func/std": 0.37336239218711853, |
| "sampling/importance_sampling_ratio/max": 2.0322628021240234, |
| "sampling/importance_sampling_ratio/mean": 0.9984027147293091, |
| "sampling/importance_sampling_ratio/min": 0.535262942314148, |
| "sampling/sampling_logp_difference/max": 0.7091498374938965, |
| "sampling/sampling_logp_difference/mean": 0.015058638527989388, |
| "step": 60, |
| "step_time": 95.91656595599852 |
| }, |
| { |
| "clip_ratio/high_max": 0.0013888889225199819, |
| "clip_ratio/high_mean": 0.0006944444612599909, |
| "clip_ratio/low_mean": 0.0022155048209242523, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002909949282184243, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9154.0, |
| "completions/max_terminated_length": 9154.0, |
| "completions/mean_length": 7612.8125, |
| "completions/mean_terminated_length": 7612.8125, |
| "completions/min_length": 1984.0, |
| "completions/min_terminated_length": 1984.0, |
| "entropy": 0.10300399106927216, |
| "epoch": 0.00244, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.391798973083496, |
| "kl": 0.004920201638014987, |
| "learning_rate": 7.930815478331545e-06, |
| "loss": -0.1339, |
| "num_tokens": 16748102.0, |
| "reward": -0.4280434846878052, |
| "reward_std": 0.4262932240962982, |
| "rewards/rollout_reward_func/mean": -0.4280434846878052, |
| "rewards/rollout_reward_func/std": 0.5089011192321777, |
| "sampling/importance_sampling_ratio/max": 1.7586421966552734, |
| "sampling/importance_sampling_ratio/mean": 1.000064730644226, |
| "sampling/importance_sampling_ratio/min": 0.37761256098747253, |
| "sampling/sampling_logp_difference/max": 0.9738866090774536, |
| "sampling/sampling_logp_difference/mean": 0.011382215656340122, |
| "step": 61, |
| "step_time": 92.25858828300079 |
| }, |
| { |
| "clip_ratio/high_max": 0.007856509007979184, |
| "clip_ratio/high_mean": 0.003928254503989592, |
| "clip_ratio/low_mean": 0.0018972224788740277, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00582547701196745, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9187.0, |
| "completions/max_terminated_length": 9187.0, |
| "completions/mean_length": 7976.46875, |
| "completions/mean_terminated_length": 7976.46875, |
| "completions/min_length": 3602.0, |
| "completions/min_terminated_length": 3602.0, |
| "entropy": 0.11776942946016788, |
| "epoch": 0.00248, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3773763179779053, |
| "kl": 0.006828719437180553, |
| "learning_rate": 7.925193596771585e-06, |
| "loss": -0.1787, |
| "num_tokens": 17030139.0, |
| "reward": -0.4070221781730652, |
| "reward_std": 0.4106467366218567, |
| "rewards/rollout_reward_func/mean": -0.4070221781730652, |
| "rewards/rollout_reward_func/std": 0.44072529673576355, |
| "sampling/importance_sampling_ratio/max": 1.859816312789917, |
| "sampling/importance_sampling_ratio/mean": 0.9994407296180725, |
| "sampling/importance_sampling_ratio/min": 0.4592384099960327, |
| "sampling/sampling_logp_difference/max": 0.7781858444213867, |
| "sampling/sampling_logp_difference/mean": 0.011814535595476627, |
| "step": 62, |
| "step_time": 92.16294349900272 |
| }, |
| { |
| "clip_ratio/high_max": 0.0035953749902546406, |
| "clip_ratio/high_mean": 0.0017976874951273203, |
| "clip_ratio/low_mean": 0.0022025909856893122, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004000278509920463, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9030.0, |
| "completions/max_terminated_length": 9030.0, |
| "completions/mean_length": 8136.34375, |
| "completions/mean_terminated_length": 8136.34375, |
| "completions/min_length": 2270.0, |
| "completions/min_terminated_length": 2270.0, |
| "entropy": 0.09144307160750031, |
| "epoch": 0.00252, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1514160633087158, |
| "kl": 0.00797528739349218, |
| "learning_rate": 7.919355011699786e-06, |
| "loss": -0.0769, |
| "num_tokens": 17316894.0, |
| "reward": -0.4627407193183899, |
| "reward_std": 0.3360978960990906, |
| "rewards/rollout_reward_func/mean": -0.4627407193183899, |
| "rewards/rollout_reward_func/std": 0.35081133246421814, |
| "sampling/importance_sampling_ratio/max": 1.6952903270721436, |
| "sampling/importance_sampling_ratio/mean": 0.9994579553604126, |
| "sampling/importance_sampling_ratio/min": 0.4881785213947296, |
| "sampling/sampling_logp_difference/max": 0.7170741558074951, |
| "sampling/sampling_logp_difference/mean": 0.010036073625087738, |
| "step": 63, |
| "step_time": 96.17660570699991 |
| }, |
| { |
| "clip_ratio/high_max": 0.002883423527237028, |
| "clip_ratio/high_mean": 0.001441711763618514, |
| "clip_ratio/low_mean": 0.00249927889672108, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003940990660339594, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9163.0, |
| "completions/max_terminated_length": 9163.0, |
| "completions/mean_length": 7561.375, |
| "completions/mean_terminated_length": 7561.375, |
| "completions/min_length": 3059.0, |
| "completions/min_terminated_length": 3059.0, |
| "entropy": 0.09984453371725976, |
| "epoch": 0.00256, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3233143091201782, |
| "kl": 0.00887665447044128, |
| "learning_rate": 7.913300155648851e-06, |
| "loss": 0.0311, |
| "num_tokens": 17585042.0, |
| "reward": -0.3819345235824585, |
| "reward_std": 0.46788647770881653, |
| "rewards/rollout_reward_func/mean": -0.3819345235824585, |
| "rewards/rollout_reward_func/std": 0.5240892171859741, |
| "sampling/importance_sampling_ratio/max": 1.9474657773971558, |
| "sampling/importance_sampling_ratio/mean": 0.9996236562728882, |
| "sampling/importance_sampling_ratio/min": 0.537067711353302, |
| "sampling/sampling_logp_difference/max": 0.6665289402008057, |
| "sampling/sampling_logp_difference/mean": 0.010441070422530174, |
| "step": 64, |
| "step_time": 92.7075216550038 |
| }, |
| { |
| "clip_ratio/high_max": 0.009018870943691581, |
| "clip_ratio/high_mean": 0.004860559041844681, |
| "clip_ratio/low_mean": 0.001825410407036543, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006685969448881224, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9200.0, |
| "completions/max_terminated_length": 9200.0, |
| "completions/mean_length": 7343.84375, |
| "completions/mean_terminated_length": 7343.84375, |
| "completions/min_length": 1590.0, |
| "completions/min_terminated_length": 1590.0, |
| "entropy": 0.11347688268870115, |
| "epoch": 0.0026, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4888594150543213, |
| "kl": 0.01094079991162289, |
| "learning_rate": 7.907029477173219e-06, |
| "loss": 0.0067, |
| "num_tokens": 17846223.0, |
| "reward": -0.3279417157173157, |
| "reward_std": 0.5277228951454163, |
| "rewards/rollout_reward_func/mean": -0.3279417157173157, |
| "rewards/rollout_reward_func/std": 0.6081415414810181, |
| "sampling/importance_sampling_ratio/max": 1.5022884607315063, |
| "sampling/importance_sampling_ratio/mean": 0.9989460706710815, |
| "sampling/importance_sampling_ratio/min": 0.5411266088485718, |
| "sampling/sampling_logp_difference/max": 0.6141020059585571, |
| "sampling/sampling_logp_difference/mean": 0.011345047503709793, |
| "step": 65, |
| "step_time": 90.58205723999708 |
| }, |
| { |
| "clip_ratio/high_max": 0.004411768633872271, |
| "clip_ratio/high_mean": 0.002557007916038856, |
| "clip_ratio/low_mean": 0.0017878134967759252, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004344821412814781, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9217.0, |
| "completions/max_terminated_length": 9217.0, |
| "completions/mean_length": 7967.03125, |
| "completions/mean_terminated_length": 7967.03125, |
| "completions/min_length": 2804.0, |
| "completions/min_terminated_length": 2804.0, |
| "entropy": 0.1117274472489953, |
| "epoch": 0.00264, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3865375518798828, |
| "kl": 0.009410161008418072, |
| "learning_rate": 7.900543440815832e-06, |
| "loss": -0.0278, |
| "num_tokens": 18127517.0, |
| "reward": -0.3698572516441345, |
| "reward_std": 0.43620961904525757, |
| "rewards/rollout_reward_func/mean": -0.3698572516441345, |
| "rewards/rollout_reward_func/std": 0.48424720764160156, |
| "sampling/importance_sampling_ratio/max": 2.5134615898132324, |
| "sampling/importance_sampling_ratio/mean": 0.999183714389801, |
| "sampling/importance_sampling_ratio/min": 0.4420686364173889, |
| "sampling/sampling_logp_difference/max": 0.9216609001159668, |
| "sampling/sampling_logp_difference/mean": 0.01282300055027008, |
| "step": 66, |
| "step_time": 95.30812872799834 |
| }, |
| { |
| "clip_ratio/high_max": 0.0057083725114353, |
| "clip_ratio/high_mean": 0.00285418625571765, |
| "clip_ratio/low_mean": 0.002307449496584013, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005161635781405494, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9051.0, |
| "completions/max_terminated_length": 9051.0, |
| "completions/mean_length": 7223.84375, |
| "completions/mean_terminated_length": 7223.84375, |
| "completions/min_length": 196.0, |
| "completions/min_terminated_length": 196.0, |
| "entropy": 0.10340581508353353, |
| "epoch": 0.00268, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1929662227630615, |
| "kl": 0.004904634231934324, |
| "learning_rate": 7.89384252707373e-06, |
| "loss": -0.3888, |
| "num_tokens": 18385190.0, |
| "reward": -0.48364531993865967, |
| "reward_std": 0.3940197825431824, |
| "rewards/rollout_reward_func/mean": -0.48364531993865967, |
| "rewards/rollout_reward_func/std": 0.428783118724823, |
| "sampling/importance_sampling_ratio/max": 1.5800230503082275, |
| "sampling/importance_sampling_ratio/mean": 0.9995721578598022, |
| "sampling/importance_sampling_ratio/min": 0.3921872675418854, |
| "sampling/sampling_logp_difference/max": 0.9360158443450928, |
| "sampling/sampling_logp_difference/mean": 0.010175148025155067, |
| "step": 67, |
| "step_time": 91.31019782999829 |
| }, |
| { |
| "clip_ratio/high_max": 0.009525597095489502, |
| "clip_ratio/high_mean": 0.005110020807478577, |
| "clip_ratio/low_mean": 0.002821325935656205, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00793134668492712, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9310.0, |
| "completions/max_terminated_length": 9310.0, |
| "completions/mean_length": 7807.96875, |
| "completions/mean_terminated_length": 7807.96875, |
| "completions/min_length": 1538.0, |
| "completions/min_terminated_length": 1538.0, |
| "entropy": 0.11348246154375374, |
| "epoch": 0.00272, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4794671535491943, |
| "kl": 0.009939985982782673, |
| "learning_rate": 7.886927232362445e-06, |
| "loss": 0.1917, |
| "num_tokens": 18661453.0, |
| "reward": -0.29462701082229614, |
| "reward_std": 0.5272692441940308, |
| "rewards/rollout_reward_func/mean": -0.29462701082229614, |
| "rewards/rollout_reward_func/std": 0.5638213753700256, |
| "sampling/importance_sampling_ratio/max": 2.0398762226104736, |
| "sampling/importance_sampling_ratio/mean": 1.0006808042526245, |
| "sampling/importance_sampling_ratio/min": 0.3978581726551056, |
| "sampling/sampling_logp_difference/max": 0.9216597080230713, |
| "sampling/sampling_logp_difference/mean": 0.013158271089196205, |
| "step": 68, |
| "step_time": 94.51114526800302 |
| }, |
| { |
| "clip_ratio/high_max": 0.0028011860558763146, |
| "clip_ratio/high_mean": 0.0017478152585681528, |
| "clip_ratio/low_mean": 0.0007023358775768429, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0024501511361449957, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9273.0, |
| "completions/max_terminated_length": 9273.0, |
| "completions/mean_length": 8334.46875, |
| "completions/mean_terminated_length": 8334.46875, |
| "completions/min_length": 2627.0, |
| "completions/min_terminated_length": 2627.0, |
| "entropy": 0.10280115297064185, |
| "epoch": 0.00276, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4324398040771484, |
| "kl": 0.007807943955413066, |
| "learning_rate": 7.879798068979234e-06, |
| "loss": 0.0179, |
| "num_tokens": 18954599.0, |
| "reward": -0.3730016350746155, |
| "reward_std": 0.33994197845458984, |
| "rewards/rollout_reward_func/mean": -0.3730016350746155, |
| "rewards/rollout_reward_func/std": 0.4095904231071472, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.00093674659729, |
| "sampling/importance_sampling_ratio/min": 0.49454185366630554, |
| "sampling/sampling_logp_difference/max": 1.1368892192840576, |
| "sampling/sampling_logp_difference/mean": 0.010166226886212826, |
| "step": 69, |
| "step_time": 95.59550246900108 |
| }, |
| { |
| "clip_ratio/high_max": 0.004303681547753513, |
| "clip_ratio/high_mean": 0.0021518407738767564, |
| "clip_ratio/low_mean": 0.0021395922813098878, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004291433084290475, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9158.0, |
| "completions/max_terminated_length": 9158.0, |
| "completions/mean_length": 7782.03125, |
| "completions/mean_terminated_length": 7782.03125, |
| "completions/min_length": 1927.0, |
| "completions/min_terminated_length": 1927.0, |
| "entropy": 0.09547213604673743, |
| "epoch": 0.0028, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1895850896835327, |
| "kl": 0.005259023229882587, |
| "learning_rate": 7.872455565065118e-06, |
| "loss": -0.1185, |
| "num_tokens": 19230041.0, |
| "reward": -0.4593248963356018, |
| "reward_std": 0.37924250960350037, |
| "rewards/rollout_reward_func/mean": -0.4593248963356018, |
| "rewards/rollout_reward_func/std": 0.39817455410957336, |
| "sampling/importance_sampling_ratio/max": 1.7586390972137451, |
| "sampling/importance_sampling_ratio/mean": 0.9991085529327393, |
| "sampling/importance_sampling_ratio/min": 0.4861142635345459, |
| "sampling/sampling_logp_difference/max": 0.7213115692138672, |
| "sampling/sampling_logp_difference/mean": 0.01034074928611517, |
| "step": 70, |
| "step_time": 92.33433518700258 |
| }, |
| { |
| "clip_ratio/high_max": 0.00991362234344706, |
| "clip_ratio/high_mean": 0.0060480811225716025, |
| "clip_ratio/low_mean": 0.0010794433765113354, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007127524469979107, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9219.0, |
| "completions/max_terminated_length": 9219.0, |
| "completions/mean_length": 7936.96875, |
| "completions/mean_terminated_length": 7936.96875, |
| "completions/min_length": 307.0, |
| "completions/min_terminated_length": 307.0, |
| "entropy": 0.11046935408376157, |
| "epoch": 0.00284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2573769092559814, |
| "kl": 0.00650980025602621, |
| "learning_rate": 7.864900264565765e-06, |
| "loss": -0.0379, |
| "num_tokens": 19509951.0, |
| "reward": -0.520825982093811, |
| "reward_std": 0.35236403346061707, |
| "rewards/rollout_reward_func/mean": -0.520825982093811, |
| "rewards/rollout_reward_func/std": 0.42140084505081177, |
| "sampling/importance_sampling_ratio/max": 1.9192090034484863, |
| "sampling/importance_sampling_ratio/mean": 0.9996625185012817, |
| "sampling/importance_sampling_ratio/min": 0.4776991903781891, |
| "sampling/sampling_logp_difference/max": 0.7387740612030029, |
| "sampling/sampling_logp_difference/mean": 0.010734163224697113, |
| "step": 71, |
| "step_time": 93.32257420199858 |
| }, |
| { |
| "clip_ratio/high_max": 0.00632950384169817, |
| "clip_ratio/high_mean": 0.004238442983478308, |
| "clip_ratio/low_mean": 0.001788180525181815, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006026623508660123, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9523.0, |
| "completions/max_terminated_length": 9523.0, |
| "completions/mean_length": 8397.84375, |
| "completions/mean_terminated_length": 8397.84375, |
| "completions/min_length": 2433.0, |
| "completions/min_terminated_length": 2433.0, |
| "entropy": 0.11352175939828157, |
| "epoch": 0.00288, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4297624826431274, |
| "kl": 0.005914928762649652, |
| "learning_rate": 7.857132727191193e-06, |
| "loss": 0.0541, |
| "num_tokens": 19805153.0, |
| "reward": -0.3045397400856018, |
| "reward_std": 0.33476388454437256, |
| "rewards/rollout_reward_func/mean": -0.3045397400856018, |
| "rewards/rollout_reward_func/std": 0.4641619026660919, |
| "sampling/importance_sampling_ratio/max": 1.5166332721710205, |
| "sampling/importance_sampling_ratio/mean": 1.000412940979004, |
| "sampling/importance_sampling_ratio/min": 0.4192443788051605, |
| "sampling/sampling_logp_difference/max": 0.8693013191223145, |
| "sampling/sampling_logp_difference/mean": 0.010624254122376442, |
| "step": 72, |
| "step_time": 96.8673131069936 |
| }, |
| { |
| "clip_ratio/high_max": 0.006114824442192912, |
| "clip_ratio/high_mean": 0.003057412221096456, |
| "clip_ratio/low_mean": 0.0060244997730478644, |
| "clip_ratio/low_min": 0.002430555585306138, |
| "clip_ratio/region_mean": 0.00908191199414432, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9157.0, |
| "completions/max_terminated_length": 9157.0, |
| "completions/mean_length": 7370.25, |
| "completions/mean_terminated_length": 7370.25, |
| "completions/min_length": 1337.0, |
| "completions/min_terminated_length": 1337.0, |
| "entropy": 0.12441039457917213, |
| "epoch": 0.00292, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3885090351104736, |
| "kl": 0.014640487075666897, |
| "learning_rate": 7.849153528374295e-06, |
| "loss": -0.1697, |
| "num_tokens": 20067395.0, |
| "reward": -0.5984026789665222, |
| "reward_std": 0.3406347632408142, |
| "rewards/rollout_reward_func/mean": -0.5984026789665222, |
| "rewards/rollout_reward_func/std": 0.4121968448162079, |
| "sampling/importance_sampling_ratio/max": 1.8335965871810913, |
| "sampling/importance_sampling_ratio/mean": 1.000123381614685, |
| "sampling/importance_sampling_ratio/min": 0.560712456703186, |
| "sampling/sampling_logp_difference/max": 0.6062793731689453, |
| "sampling/sampling_logp_difference/mean": 0.013483337126672268, |
| "step": 73, |
| "step_time": 89.83431924199795 |
| }, |
| { |
| "clip_ratio/high_max": 0.007186023925896734, |
| "clip_ratio/high_mean": 0.00395220736390911, |
| "clip_ratio/low_mean": 0.0026030205481220037, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006555227912031114, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9158.0, |
| "completions/max_terminated_length": 9158.0, |
| "completions/mean_length": 7705.6875, |
| "completions/mean_terminated_length": 7705.6875, |
| "completions/min_length": 578.0, |
| "completions/min_terminated_length": 578.0, |
| "entropy": 0.10846898006275296, |
| "epoch": 0.00296, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3326221704483032, |
| "kl": 0.008529015118256211, |
| "learning_rate": 7.840963259228225e-06, |
| "loss": -0.1738, |
| "num_tokens": 20340257.0, |
| "reward": -0.4566282629966736, |
| "reward_std": 0.36557790637016296, |
| "rewards/rollout_reward_func/mean": -0.4566282629966736, |
| "rewards/rollout_reward_func/std": 0.39172062277793884, |
| "sampling/importance_sampling_ratio/max": 1.7097969055175781, |
| "sampling/importance_sampling_ratio/mean": 0.9991310238838196, |
| "sampling/importance_sampling_ratio/min": 0.39339712262153625, |
| "sampling/sampling_logp_difference/max": 0.9329357147216797, |
| "sampling/sampling_logp_difference/mean": 0.011443836614489555, |
| "step": 74, |
| "step_time": 94.07717578700249 |
| }, |
| { |
| "clip_ratio/high_max": 0.0028093435103073716, |
| "clip_ratio/high_mean": 0.0014046717551536858, |
| "clip_ratio/low_mean": 0.0007022471982054412, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002106918953359127, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9250.0, |
| "completions/max_terminated_length": 9250.0, |
| "completions/mean_length": 7632.8125, |
| "completions/mean_terminated_length": 7632.8125, |
| "completions/min_length": 414.0, |
| "completions/min_terminated_length": 414.0, |
| "entropy": 0.10531778959557414, |
| "epoch": 0.003, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.4729112386703491, |
| "kl": 0.009549812879413366, |
| "learning_rate": 7.832562526502598e-06, |
| "loss": -0.1487, |
| "num_tokens": 20610590.0, |
| "reward": -0.5081942677497864, |
| "reward_std": 0.3885437250137329, |
| "rewards/rollout_reward_func/mean": -0.5081942677497864, |
| "rewards/rollout_reward_func/std": 0.4038866460323334, |
| "sampling/importance_sampling_ratio/max": 2.1431355476379395, |
| "sampling/importance_sampling_ratio/mean": 1.0006523132324219, |
| "sampling/importance_sampling_ratio/min": 0.5124464631080627, |
| "sampling/sampling_logp_difference/max": 0.7622699737548828, |
| "sampling/sampling_logp_difference/mean": 0.010758567601442337, |
| "step": 75, |
| "step_time": 91.45560667300197 |
| }, |
| { |
| "clip_ratio/high_max": 0.004320964915677905, |
| "clip_ratio/high_mean": 0.0025281294947490096, |
| "clip_ratio/low_mean": 0.002119072509231046, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0046472020039800555, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9478.0, |
| "completions/max_terminated_length": 9478.0, |
| "completions/mean_length": 7664.84375, |
| "completions/mean_terminated_length": 7664.84375, |
| "completions/min_length": 290.0, |
| "completions/min_terminated_length": 290.0, |
| "entropy": 0.1210014394018799, |
| "epoch": 0.00304, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.333920955657959, |
| "kl": 0.008828506610370823, |
| "learning_rate": 7.82395195253854e-06, |
| "loss": -0.0473, |
| "num_tokens": 20882278.0, |
| "reward": -0.33376026153564453, |
| "reward_std": 0.5433810949325562, |
| "rewards/rollout_reward_func/mean": -0.33376026153564453, |
| "rewards/rollout_reward_func/std": 0.5932837724685669, |
| "sampling/importance_sampling_ratio/max": 1.7989699840545654, |
| "sampling/importance_sampling_ratio/mean": 0.9988389015197754, |
| "sampling/importance_sampling_ratio/min": 0.5031105279922485, |
| "sampling/sampling_logp_difference/max": 0.686945378780365, |
| "sampling/sampling_logp_difference/mean": 0.01223824918270111, |
| "step": 76, |
| "step_time": 93.80719368800237 |
| }, |
| { |
| "clip_ratio/high_max": 0.004671525151934475, |
| "clip_ratio/high_mean": 0.0023357625759672374, |
| "clip_ratio/low_mean": 0.0031846001220401376, |
| "clip_ratio/low_min": 0.0007022471982054412, |
| "clip_ratio/region_mean": 0.0055203626689035445, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 8957.0, |
| "completions/max_terminated_length": 8957.0, |
| "completions/mean_length": 7233.0625, |
| "completions/mean_terminated_length": 7233.0625, |
| "completions/min_length": 199.0, |
| "completions/min_terminated_length": 199.0, |
| "entropy": 0.11074003390967846, |
| "epoch": 0.00308, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.5347291231155396, |
| "kl": 0.00865966931451112, |
| "learning_rate": 7.815132175222592e-06, |
| "loss": -0.1913, |
| "num_tokens": 21139607.0, |
| "reward": -0.4299502968788147, |
| "reward_std": 0.5717419385910034, |
| "rewards/rollout_reward_func/mean": -0.4299502968788147, |
| "rewards/rollout_reward_func/std": 0.5658487677574158, |
| "sampling/importance_sampling_ratio/max": 3.0, |
| "sampling/importance_sampling_ratio/mean": 1.000859260559082, |
| "sampling/importance_sampling_ratio/min": 0.5086528062820435, |
| "sampling/sampling_logp_difference/max": 1.1856822967529297, |
| "sampling/sampling_logp_difference/mean": 0.011567970737814903, |
| "step": 77, |
| "step_time": 88.42593535700325 |
| }, |
| { |
| "clip_ratio/high_max": 0.0021392186754383147, |
| "clip_ratio/high_mean": 0.0010696093377191573, |
| "clip_ratio/low_mean": 0.00273930755793117, |
| "clip_ratio/low_min": 0.0007022471982054412, |
| "clip_ratio/region_mean": 0.0038089169247541577, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9201.0, |
| "completions/max_terminated_length": 9201.0, |
| "completions/mean_length": 7827.09375, |
| "completions/mean_terminated_length": 7827.09375, |
| "completions/min_length": 1987.0, |
| "completions/min_terminated_length": 1987.0, |
| "entropy": 0.10664277244359255, |
| "epoch": 0.00312, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2181073427200317, |
| "kl": 0.006914699915796518, |
| "learning_rate": 7.806103847939445e-06, |
| "loss": -0.1403, |
| "num_tokens": 21416098.0, |
| "reward": -0.44013938307762146, |
| "reward_std": 0.407479465007782, |
| "rewards/rollout_reward_func/mean": -0.44013938307762146, |
| "rewards/rollout_reward_func/std": 0.44036611914634705, |
| "sampling/importance_sampling_ratio/max": 2.069561004638672, |
| "sampling/importance_sampling_ratio/mean": 1.0012619495391846, |
| "sampling/importance_sampling_ratio/min": 0.6060473918914795, |
| "sampling/sampling_logp_difference/max": 0.7273365259170532, |
| "sampling/sampling_logp_difference/mean": 0.010566653683781624, |
| "step": 78, |
| "step_time": 92.46403468800418 |
| }, |
| { |
| "clip_ratio/high_max": 0.004304705187678337, |
| "clip_ratio/high_mean": 0.0021523525938391685, |
| "clip_ratio/low_mean": 0.00035511364694684744, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002507466240786016, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9566.0, |
| "completions/max_terminated_length": 9566.0, |
| "completions/mean_length": 6899.25, |
| "completions/mean_terminated_length": 6899.25, |
| "completions/min_length": 950.0, |
| "completions/min_terminated_length": 950.0, |
| "entropy": 0.1280936081893742, |
| "epoch": 0.00316, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.282666802406311, |
| "kl": 0.01210544134664815, |
| "learning_rate": 7.79686763952354e-06, |
| "loss": -0.3126, |
| "num_tokens": 21663152.0, |
| "reward": -0.4885612428188324, |
| "reward_std": 0.42654168605804443, |
| "rewards/rollout_reward_func/mean": -0.4885612428188324, |
| "rewards/rollout_reward_func/std": 0.4574175775051117, |
| "sampling/importance_sampling_ratio/max": 1.6807042360305786, |
| "sampling/importance_sampling_ratio/mean": 0.9999642372131348, |
| "sampling/importance_sampling_ratio/min": 0.6889766454696655, |
| "sampling/sampling_logp_difference/max": 0.5192129611968994, |
| "sampling/sampling_logp_difference/mean": 0.012327692471444607, |
| "step": 79, |
| "step_time": 86.57649834300173 |
| }, |
| { |
| "clip_ratio/high_max": 0.005642829288262874, |
| "clip_ratio/high_mean": 0.002821414644131437, |
| "clip_ratio/low_mean": 0.0033451107738073915, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006166525447042659, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9229.0, |
| "completions/max_terminated_length": 9229.0, |
| "completions/mean_length": 7748.8125, |
| "completions/mean_terminated_length": 7748.8125, |
| "completions/min_length": 959.0, |
| "completions/min_terminated_length": 959.0, |
| "entropy": 0.11266007972881198, |
| "epoch": 0.0032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2385865449905396, |
| "kl": 0.009715210646390915, |
| "learning_rate": 7.787424234209523e-06, |
| "loss": -0.2138, |
| "num_tokens": 21937291.0, |
| "reward": -0.5135730504989624, |
| "reward_std": 0.28290629386901855, |
| "rewards/rollout_reward_func/mean": -0.5135730504989624, |
| "rewards/rollout_reward_func/std": 0.29492101073265076, |
| "sampling/importance_sampling_ratio/max": 1.8985683917999268, |
| "sampling/importance_sampling_ratio/mean": 1.0005488395690918, |
| "sampling/importance_sampling_ratio/min": 0.21555323898792267, |
| "sampling/sampling_logp_difference/max": 1.5345473289489746, |
| "sampling/sampling_logp_difference/mean": 0.012439057230949402, |
| "step": 80, |
| "step_time": 94.79033007999533 |
| }, |
| { |
| "clip_ratio/high_max": 0.004234770254697651, |
| "clip_ratio/high_mean": 0.002476580470101908, |
| "clip_ratio/low_mean": 0.0045510306663345546, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.007027611136436462, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9216.0, |
| "completions/max_terminated_length": 9216.0, |
| "completions/mean_length": 7755.90625, |
| "completions/mean_terminated_length": 7755.90625, |
| "completions/min_length": 1219.0, |
| "completions/min_terminated_length": 1219.0, |
| "entropy": 0.10599858709610999, |
| "epoch": 0.00324, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1525481939315796, |
| "kl": 0.0123361352016218, |
| "learning_rate": 7.77777433158155e-06, |
| "loss": -0.1473, |
| "num_tokens": 22211636.0, |
| "reward": -0.4603438377380371, |
| "reward_std": 0.3456616699695587, |
| "rewards/rollout_reward_func/mean": -0.4603438377380371, |
| "rewards/rollout_reward_func/std": 0.43396344780921936, |
| "sampling/importance_sampling_ratio/max": 2.0103213787078857, |
| "sampling/importance_sampling_ratio/mean": 0.999226450920105, |
| "sampling/importance_sampling_ratio/min": 0.6042999029159546, |
| "sampling/sampling_logp_difference/max": 0.6982946395874023, |
| "sampling/sampling_logp_difference/mean": 0.01090179942548275, |
| "step": 81, |
| "step_time": 91.66288167299899 |
| }, |
| { |
| "clip_ratio/high_max": 0.006423490063752979, |
| "clip_ratio/high_mean": 0.0035628686600830406, |
| "clip_ratio/low_mean": 0.002858268067939207, |
| "clip_ratio/low_min": 0.0014044943964108825, |
| "clip_ratio/region_mean": 0.0064211367862299085, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9370.0, |
| "completions/max_terminated_length": 9370.0, |
| "completions/mean_length": 7690.96875, |
| "completions/mean_terminated_length": 7690.96875, |
| "completions/min_length": 655.0, |
| "completions/min_terminated_length": 655.0, |
| "entropy": 0.12091189902275801, |
| "epoch": 0.00328, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2017008066177368, |
| "kl": 0.011158690598676912, |
| "learning_rate": 7.767918646521461e-06, |
| "loss": -0.0613, |
| "num_tokens": 22483765.0, |
| "reward": -0.3411584496498108, |
| "reward_std": 0.49997615814208984, |
| "rewards/rollout_reward_func/mean": -0.3411584496498108, |
| "rewards/rollout_reward_func/std": 0.5335732698440552, |
| "sampling/importance_sampling_ratio/max": 1.4986999034881592, |
| "sampling/importance_sampling_ratio/mean": 0.9999135732650757, |
| "sampling/importance_sampling_ratio/min": 0.37155619263648987, |
| "sampling/sampling_logp_difference/max": 0.9900552034378052, |
| "sampling/sampling_logp_difference/mean": 0.010972358286380768, |
| "step": 82, |
| "step_time": 95.45822799199959 |
| }, |
| { |
| "clip_ratio/high_max": 0.005522000603377819, |
| "clip_ratio/high_mean": 0.0027610003016889095, |
| "clip_ratio/low_mean": 0.0028991997824050486, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005660199996782467, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9288.0, |
| "completions/max_terminated_length": 9288.0, |
| "completions/mean_length": 7861.5, |
| "completions/mean_terminated_length": 7861.5, |
| "completions/min_length": 300.0, |
| "completions/min_terminated_length": 300.0, |
| "entropy": 0.12187603581696749, |
| "epoch": 0.00332, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2622228860855103, |
| "kl": 0.014604453899664804, |
| "learning_rate": 7.75785790915583e-06, |
| "loss": 0.0668, |
| "num_tokens": 22761866.0, |
| "reward": -0.3003544807434082, |
| "reward_std": 0.6722631454467773, |
| "rewards/rollout_reward_func/mean": -0.3003544807434082, |
| "rewards/rollout_reward_func/std": 0.6533814668655396, |
| "sampling/importance_sampling_ratio/max": 1.6991353034973145, |
| "sampling/importance_sampling_ratio/mean": 0.9996775984764099, |
| "sampling/importance_sampling_ratio/min": 0.07604925334453583, |
| "sampling/sampling_logp_difference/max": 2.576374053955078, |
| "sampling/sampling_logp_difference/mean": 0.01398524735122919, |
| "step": 83, |
| "step_time": 93.20210950300316 |
| }, |
| { |
| "clip_ratio/high_max": 0.00501747039379552, |
| "clip_ratio/high_mean": 0.00250873519689776, |
| "clip_ratio/low_mean": 0.0032708367507439107, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.00577957188943401, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9430.0, |
| "completions/max_terminated_length": 9430.0, |
| "completions/mean_length": 7677.46875, |
| "completions/mean_terminated_length": 7677.46875, |
| "completions/min_length": 1991.0, |
| "completions/min_terminated_length": 1991.0, |
| "entropy": 0.11121218441985548, |
| "epoch": 0.00336, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3994040489196777, |
| "kl": 0.027663854853017256, |
| "learning_rate": 7.74759286480186e-06, |
| "loss": -0.098, |
| "num_tokens": 23033870.0, |
| "reward": -0.3911195993423462, |
| "reward_std": 0.40892452001571655, |
| "rewards/rollout_reward_func/mean": -0.3911195993423462, |
| "rewards/rollout_reward_func/std": 0.49502092599868774, |
| "sampling/importance_sampling_ratio/max": 1.9817404747009277, |
| "sampling/importance_sampling_ratio/mean": 0.9991714954376221, |
| "sampling/importance_sampling_ratio/min": 0.32034245133399963, |
| "sampling/sampling_logp_difference/max": 1.1383646726608276, |
| "sampling/sampling_logp_difference/mean": 0.013728287070989609, |
| "step": 84, |
| "step_time": 93.50239179400523 |
| }, |
| { |
| "clip_ratio/high_max": 0.006439179298467934, |
| "clip_ratio/high_mean": 0.003566811908967793, |
| "clip_ratio/low_mean": 0.0006127451197244227, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004179557028692216, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9303.0, |
| "completions/max_terminated_length": 9303.0, |
| "completions/mean_length": 7925.1875, |
| "completions/mean_terminated_length": 7925.1875, |
| "completions/min_length": 1447.0, |
| "completions/min_terminated_length": 1447.0, |
| "entropy": 0.1154963904991746, |
| "epoch": 0.0034, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1012736558914185, |
| "kl": 0.01787613780470565, |
| "learning_rate": 7.737124273912181e-06, |
| "loss": -0.2728, |
| "num_tokens": 23313399.0, |
| "reward": -0.43042296171188354, |
| "reward_std": 0.3645261526107788, |
| "rewards/rollout_reward_func/mean": -0.43042296171188354, |
| "rewards/rollout_reward_func/std": 0.39467355608940125, |
| "sampling/importance_sampling_ratio/max": 1.962335228919983, |
| "sampling/importance_sampling_ratio/mean": 0.999538779258728, |
| "sampling/importance_sampling_ratio/min": 0.41583091020584106, |
| "sampling/sampling_logp_difference/max": 0.8774765729904175, |
| "sampling/sampling_logp_difference/mean": 0.011602518148720264, |
| "step": 85, |
| "step_time": 91.32576291800251 |
| }, |
| { |
| "clip_ratio/high_max": 0.0035516806528903544, |
| "clip_ratio/high_mean": 0.0021309539733920246, |
| "clip_ratio/low_mean": 0.0014086648297961801, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0035396188031882048, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9175.0, |
| "completions/max_terminated_length": 9175.0, |
| "completions/mean_length": 7601.90625, |
| "completions/mean_terminated_length": 7601.90625, |
| "completions/min_length": 272.0, |
| "completions/min_terminated_length": 272.0, |
| "entropy": 0.13196291960775852, |
| "epoch": 0.00344, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.298827052116394, |
| "kl": 0.013925580627983436, |
| "learning_rate": 7.72645291201851e-06, |
| "loss": -0.2696, |
| "num_tokens": 23582639.0, |
| "reward": -0.40399467945098877, |
| "reward_std": 0.350833535194397, |
| "rewards/rollout_reward_func/mean": -0.40399467945098877, |
| "rewards/rollout_reward_func/std": 0.40605658292770386, |
| "sampling/importance_sampling_ratio/max": 1.568974256515503, |
| "sampling/importance_sampling_ratio/mean": 0.9993267059326172, |
| "sampling/importance_sampling_ratio/min": 0.6157581806182861, |
| "sampling/sampling_logp_difference/max": 0.48490095138549805, |
| "sampling/sampling_logp_difference/mean": 0.011755777522921562, |
| "step": 86, |
| "step_time": 94.93100565299937 |
| }, |
| { |
| "clip_ratio/high_max": 0.005048143619205803, |
| "clip_ratio/high_mean": 0.0025240718096029013, |
| "clip_ratio/low_mean": 0.003231386741390452, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005755458580097184, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9284.0, |
| "completions/max_terminated_length": 9284.0, |
| "completions/mean_length": 7635.3125, |
| "completions/mean_terminated_length": 7635.3125, |
| "completions/min_length": 2500.0, |
| "completions/min_terminated_length": 2500.0, |
| "entropy": 0.11647467198781669, |
| "epoch": 0.00348, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1801884174346924, |
| "kl": 0.013889241206925362, |
| "learning_rate": 7.715579569674193e-06, |
| "loss": -0.1101, |
| "num_tokens": 23852759.0, |
| "reward": -0.4265971779823303, |
| "reward_std": 0.4005134105682373, |
| "rewards/rollout_reward_func/mean": -0.4265971779823303, |
| "rewards/rollout_reward_func/std": 0.46251291036605835, |
| "sampling/importance_sampling_ratio/max": 2.021784782409668, |
| "sampling/importance_sampling_ratio/mean": 1.0004267692565918, |
| "sampling/importance_sampling_ratio/min": 0.4357590675354004, |
| "sampling/sampling_logp_difference/max": 0.8306658864021301, |
| "sampling/sampling_logp_difference/mean": 0.011593816801905632, |
| "step": 87, |
| "step_time": 91.54987620599968 |
| }, |
| { |
| "clip_ratio/high_max": 0.0033402342814952135, |
| "clip_ratio/high_mean": 0.0016701171407476068, |
| "clip_ratio/low_mean": 0.0017763022624421865, |
| "clip_ratio/low_min": 0.0006944444612599909, |
| "clip_ratio/region_mean": 0.0034464194031897932, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9331.0, |
| "completions/max_terminated_length": 9331.0, |
| "completions/mean_length": 8468.625, |
| "completions/mean_terminated_length": 8468.625, |
| "completions/min_length": 2484.0, |
| "completions/min_terminated_length": 2484.0, |
| "entropy": 0.119684575824067, |
| "epoch": 0.00352, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3990496397018433, |
| "kl": 0.012226378137711436, |
| "learning_rate": 7.704505052395651e-06, |
| "loss": 0.1027, |
| "num_tokens": 24149881.0, |
| "reward": -0.1723448634147644, |
| "reward_std": 0.3936545252799988, |
| "rewards/rollout_reward_func/mean": -0.1723448634147644, |
| "rewards/rollout_reward_func/std": 0.48704293370246887, |
| "sampling/importance_sampling_ratio/max": 2.3026111125946045, |
| "sampling/importance_sampling_ratio/mean": 1.0012261867523193, |
| "sampling/importance_sampling_ratio/min": 0.3968614935874939, |
| "sampling/sampling_logp_difference/max": 0.9241679906845093, |
| "sampling/sampling_logp_difference/mean": 0.012234903872013092, |
| "step": 88, |
| "step_time": 95.13692331799757 |
| }, |
| { |
| "clip_ratio/high_max": 0.0035114133497700095, |
| "clip_ratio/high_mean": 0.002106830303091556, |
| "clip_ratio/low_mean": 0.0025690634502097964, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004675893753301352, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9274.0, |
| "completions/max_terminated_length": 9274.0, |
| "completions/mean_length": 7818.875, |
| "completions/mean_terminated_length": 7818.875, |
| "completions/min_length": 730.0, |
| "completions/min_terminated_length": 730.0, |
| "entropy": 0.13537772255949676, |
| "epoch": 0.00356, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3247586488723755, |
| "kl": 0.01799499534536153, |
| "learning_rate": 7.693230180602701e-06, |
| "loss": -0.1045, |
| "num_tokens": 24426080.0, |
| "reward": -0.4246547818183899, |
| "reward_std": 0.2930266261100769, |
| "rewards/rollout_reward_func/mean": -0.4246547818183899, |
| "rewards/rollout_reward_func/std": 0.40061336755752563, |
| "sampling/importance_sampling_ratio/max": 2.4912331104278564, |
| "sampling/importance_sampling_ratio/mean": 0.99860680103302, |
| "sampling/importance_sampling_ratio/min": 0.4037307798862457, |
| "sampling/sampling_logp_difference/max": 0.9127777814865112, |
| "sampling/sampling_logp_difference/mean": 0.014037063345313072, |
| "step": 89, |
| "step_time": 94.62805927399677 |
| }, |
| { |
| "clip_ratio/high_max": 0.007079427479766309, |
| "clip_ratio/high_mean": 0.0035397137398831546, |
| "clip_ratio/low_mean": 0.003276572242612019, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006816285924287513, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9564.0, |
| "completions/max_terminated_length": 9564.0, |
| "completions/mean_length": 8033.125, |
| "completions/mean_terminated_length": 8033.125, |
| "completions/min_length": 743.0, |
| "completions/min_terminated_length": 743.0, |
| "entropy": 0.111784094478935, |
| "epoch": 0.0036, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1404390335083008, |
| "kl": 0.0197194812935777, |
| "learning_rate": 7.68175578955777e-06, |
| "loss": -0.1376, |
| "num_tokens": 24709065.0, |
| "reward": -0.4281678795814514, |
| "reward_std": 0.3626987934112549, |
| "rewards/rollout_reward_func/mean": -0.4281678795814514, |
| "rewards/rollout_reward_func/std": 0.40464091300964355, |
| "sampling/importance_sampling_ratio/max": 2.2442386150360107, |
| "sampling/importance_sampling_ratio/mean": 0.9990909099578857, |
| "sampling/importance_sampling_ratio/min": 0.4673902690410614, |
| "sampling/sampling_logp_difference/max": 0.8083662986755371, |
| "sampling/sampling_logp_difference/mean": 0.011810792610049248, |
| "step": 90, |
| "step_time": 94.34466740900098 |
| }, |
| { |
| "clip_ratio/high_max": 0.007889641856309026, |
| "clip_ratio/high_mean": 0.004651058145100251, |
| "clip_ratio/low_mean": 0.0014329880068544298, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006084046239266172, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9008.0, |
| "completions/max_terminated_length": 9008.0, |
| "completions/mean_length": 7546.65625, |
| "completions/mean_terminated_length": 7546.65625, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "entropy": 0.10288556944578886, |
| "epoch": 0.00364, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.186630368232727, |
| "kl": 0.018865237798308954, |
| "learning_rate": 7.67008272930403e-06, |
| "loss": -0.2213, |
| "num_tokens": 24976474.0, |
| "reward": -0.4113379716873169, |
| "reward_std": 0.34501367807388306, |
| "rewards/rollout_reward_func/mean": -0.4113379716873169, |
| "rewards/rollout_reward_func/std": 0.375704288482666, |
| "sampling/importance_sampling_ratio/max": 2.0333220958709717, |
| "sampling/importance_sampling_ratio/mean": 1.000919222831726, |
| "sampling/importance_sampling_ratio/min": 0.3369899392127991, |
| "sampling/sampling_logp_difference/max": 1.0877022743225098, |
| "sampling/sampling_logp_difference/mean": 0.010551965795457363, |
| "step": 91, |
| "step_time": 93.05898203200377 |
| }, |
| { |
| "clip_ratio/high_max": 0.007170227414462715, |
| "clip_ratio/high_mean": 0.004291531280614436, |
| "clip_ratio/low_mean": 0.0003511235991027206, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004642654879717156, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9088.0, |
| "completions/max_terminated_length": 9088.0, |
| "completions/mean_length": 7282.0625, |
| "completions/mean_terminated_length": 7282.0625, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "entropy": 0.1037353971041739, |
| "epoch": 0.00368, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3811126947402954, |
| "kl": 0.01996720782153716, |
| "learning_rate": 7.658211864602414e-06, |
| "loss": -0.1225, |
| "num_tokens": 25235357.0, |
| "reward": -0.21767657995224, |
| "reward_std": 0.556893527507782, |
| "rewards/rollout_reward_func/mean": -0.21767657995224, |
| "rewards/rollout_reward_func/std": 0.6004104614257812, |
| "sampling/importance_sampling_ratio/max": 2.3581435680389404, |
| "sampling/importance_sampling_ratio/mean": 1.0017621517181396, |
| "sampling/importance_sampling_ratio/min": 0.07947485893964767, |
| "sampling/sampling_logp_difference/max": 2.5323145389556885, |
| "sampling/sampling_logp_difference/mean": 0.012833436019718647, |
| "step": 92, |
| "step_time": 91.07534395600123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0027935606776736677, |
| "clip_ratio/high_mean": 0.0013967803388368338, |
| "clip_ratio/low_mean": 0.002134947048034519, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0035317273577675223, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9362.0, |
| "completions/max_terminated_length": 9362.0, |
| "completions/mean_length": 8238.3125, |
| "completions/mean_terminated_length": 8238.3125, |
| "completions/min_length": 614.0, |
| "completions/min_terminated_length": 614.0, |
| "entropy": 0.0987412256654352, |
| "epoch": 0.00372, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2942229509353638, |
| "kl": 0.014109234602074139, |
| "learning_rate": 7.64614407486756e-06, |
| "loss": -0.1174, |
| "num_tokens": 25524816.0, |
| "reward": -0.4631078839302063, |
| "reward_std": 0.36089861392974854, |
| "rewards/rollout_reward_func/mean": -0.4631078839302063, |
| "rewards/rollout_reward_func/std": 0.421775758266449, |
| "sampling/importance_sampling_ratio/max": 2.2098324298858643, |
| "sampling/importance_sampling_ratio/mean": 1.001816749572754, |
| "sampling/importance_sampling_ratio/min": 0.5984537601470947, |
| "sampling/sampling_logp_difference/max": 0.7929167747497559, |
| "sampling/sampling_logp_difference/mean": 0.01035095565021038, |
| "step": 93, |
| "step_time": 95.76501873699817 |
| }, |
| { |
| "clip_ratio/high_max": 0.004278437350876629, |
| "clip_ratio/high_mean": 0.0021392186754383147, |
| "clip_ratio/low_mean": 0.0011272609990555793, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003266479674493894, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9301.0, |
| "completions/max_terminated_length": 9301.0, |
| "completions/mean_length": 7563.09375, |
| "completions/mean_terminated_length": 7563.09375, |
| "completions/min_length": 277.0, |
| "completions/min_terminated_length": 277.0, |
| "entropy": 0.1352407243102789, |
| "epoch": 0.00376, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2077804803848267, |
| "kl": 0.033299635659204796, |
| "learning_rate": 7.633880254102664e-06, |
| "loss": -0.1037, |
| "num_tokens": 25792762.0, |
| "reward": -0.487395703792572, |
| "reward_std": 0.4409431219100952, |
| "rewards/rollout_reward_func/mean": -0.487395703792572, |
| "rewards/rollout_reward_func/std": 0.45345383882522583, |
| "sampling/importance_sampling_ratio/max": 2.2063913345336914, |
| "sampling/importance_sampling_ratio/mean": 1.001375436782837, |
| "sampling/importance_sampling_ratio/min": 0.2881685495376587, |
| "sampling/sampling_logp_difference/max": 1.2442097663879395, |
| "sampling/sampling_logp_difference/mean": 0.01331554725766182, |
| "step": 94, |
| "step_time": 91.56602428999759 |
| }, |
| { |
| "clip_ratio/high_max": 0.005700222682207823, |
| "clip_ratio/high_mean": 0.0032012349111028016, |
| "clip_ratio/low_mean": 0.0008800287614576519, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004081263672560453, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9186.0, |
| "completions/max_terminated_length": 9186.0, |
| "completions/mean_length": 7445.5, |
| "completions/mean_terminated_length": 7445.5, |
| "completions/min_length": 307.0, |
| "completions/min_terminated_length": 307.0, |
| "entropy": 0.12523270677775145, |
| "epoch": 0.0038, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1024125814437866, |
| "kl": 0.015170263191976119, |
| "learning_rate": 7.621421310833242e-06, |
| "loss": -0.2471, |
| "num_tokens": 26056916.0, |
| "reward": -0.47263962030410767, |
| "reward_std": 0.4123695492744446, |
| "rewards/rollout_reward_func/mean": -0.47263962030410767, |
| "rewards/rollout_reward_func/std": 0.42915722727775574, |
| "sampling/importance_sampling_ratio/max": 1.9018123149871826, |
| "sampling/importance_sampling_ratio/mean": 0.9993163347244263, |
| "sampling/importance_sampling_ratio/min": 0.37924882769584656, |
| "sampling/sampling_logp_difference/max": 0.9695627689361572, |
| "sampling/sampling_logp_difference/mean": 0.013316385447978973, |
| "step": 95, |
| "step_time": 91.25094345499565 |
| }, |
| { |
| "clip_ratio/high_max": 0.00731422781245783, |
| "clip_ratio/high_mean": 0.004351901414338499, |
| "clip_ratio/low_mean": 0.0017361111822538078, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006088012654799968, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9454.0, |
| "completions/max_terminated_length": 9454.0, |
| "completions/mean_length": 7747.3125, |
| "completions/mean_terminated_length": 7747.3125, |
| "completions/min_length": 3419.0, |
| "completions/min_terminated_length": 3419.0, |
| "entropy": 0.12114885123446584, |
| "epoch": 0.00384, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.179085373878479, |
| "kl": 0.02324743496137671, |
| "learning_rate": 7.608768168039832e-06, |
| "loss": -0.1271, |
| "num_tokens": 26330673.0, |
| "reward": -0.37598949670791626, |
| "reward_std": 0.43450477719306946, |
| "rewards/rollout_reward_func/mean": -0.37598949670791626, |
| "rewards/rollout_reward_func/std": 0.5974510908126831, |
| "sampling/importance_sampling_ratio/max": 1.950088381767273, |
| "sampling/importance_sampling_ratio/mean": 0.9992377161979675, |
| "sampling/importance_sampling_ratio/min": 0.35387420654296875, |
| "sampling/sampling_logp_difference/max": 1.038813829421997, |
| "sampling/sampling_logp_difference/mean": 0.01316380687057972, |
| "step": 96, |
| "step_time": 92.53011242599314 |
| }, |
| { |
| "clip_ratio/high_max": 0.004979305085726082, |
| "clip_ratio/high_mean": 0.002489652542863041, |
| "clip_ratio/low_mean": 0.002151657157810405, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004641309700673446, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9315.0, |
| "completions/max_terminated_length": 9315.0, |
| "completions/mean_length": 8012.21875, |
| "completions/mean_terminated_length": 8012.21875, |
| "completions/min_length": 381.0, |
| "completions/min_terminated_length": 381.0, |
| "entropy": 0.13530251197516918, |
| "epoch": 0.00388, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1584080457687378, |
| "kl": 0.02002575868391432, |
| "learning_rate": 7.5959217630896185e-06, |
| "loss": -0.1398, |
| "num_tokens": 26612879.0, |
| "reward": -0.3609606921672821, |
| "reward_std": 0.31441646814346313, |
| "rewards/rollout_reward_func/mean": -0.3609606921672821, |
| "rewards/rollout_reward_func/std": 0.3982548713684082, |
| "sampling/importance_sampling_ratio/max": 1.962359070777893, |
| "sampling/importance_sampling_ratio/mean": 1.0006780624389648, |
| "sampling/importance_sampling_ratio/min": 0.5123910307884216, |
| "sampling/sampling_logp_difference/max": 0.674147367477417, |
| "sampling/sampling_logp_difference/mean": 0.013763591647148132, |
| "step": 97, |
| "step_time": 93.49124078800196 |
| }, |
| { |
| "clip_ratio/high_max": 0.00703950843308121, |
| "clip_ratio/high_mean": 0.0038748678634874523, |
| "clip_ratio/low_mean": 0.0023320690961554646, |
| "clip_ratio/low_min": 0.0007022471982054412, |
| "clip_ratio/region_mean": 0.006206936959642917, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9290.0, |
| "completions/max_terminated_length": 9290.0, |
| "completions/mean_length": 7702.40625, |
| "completions/mean_terminated_length": 7702.40625, |
| "completions/min_length": 754.0, |
| "completions/min_terminated_length": 754.0, |
| "entropy": 0.13701792433857918, |
| "epoch": 0.00392, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3185031414031982, |
| "kl": 0.026673564861994237, |
| "learning_rate": 7.5828830476669816e-06, |
| "loss": -0.2084, |
| "num_tokens": 26885204.0, |
| "reward": -0.4948327839374542, |
| "reward_std": 0.31638196110725403, |
| "rewards/rollout_reward_func/mean": -0.4948327839374542, |
| "rewards/rollout_reward_func/std": 0.42699867486953735, |
| "sampling/importance_sampling_ratio/max": 1.5802547931671143, |
| "sampling/importance_sampling_ratio/mean": 0.9976853132247925, |
| "sampling/importance_sampling_ratio/min": 0.4126710891723633, |
| "sampling/sampling_logp_difference/max": 0.8851044178009033, |
| "sampling/sampling_logp_difference/mean": 0.015071339905261993, |
| "step": 98, |
| "step_time": 91.84920660200441 |
| }, |
| { |
| "clip_ratio/high_max": 0.003422708949074149, |
| "clip_ratio/high_mean": 0.0017113544745370746, |
| "clip_ratio/low_mean": 0.0013361009187065065, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003047455393243581, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9112.0, |
| "completions/max_terminated_length": 9112.0, |
| "completions/mean_length": 7679.53125, |
| "completions/mean_terminated_length": 7679.53125, |
| "completions/min_length": 894.0, |
| "completions/min_terminated_length": 894.0, |
| "entropy": 0.10829294635914266, |
| "epoch": 0.00396, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2493257522583008, |
| "kl": 0.015406116601297981, |
| "learning_rate": 7.569652987703011e-06, |
| "loss": -0.0216, |
| "num_tokens": 27156852.0, |
| "reward": -0.3716369867324829, |
| "reward_std": 0.49375930428504944, |
| "rewards/rollout_reward_func/mean": -0.3716369867324829, |
| "rewards/rollout_reward_func/std": 0.5273211598396301, |
| "sampling/importance_sampling_ratio/max": 1.4128153324127197, |
| "sampling/importance_sampling_ratio/mean": 0.9987783432006836, |
| "sampling/importance_sampling_ratio/min": 0.35299214720726013, |
| "sampling/sampling_logp_difference/max": 1.0413094758987427, |
| "sampling/sampling_logp_difference/mean": 0.010902078822255135, |
| "step": 99, |
| "step_time": 90.7731563160014 |
| }, |
| { |
| "clip_ratio/high_max": 0.007757734332699329, |
| "clip_ratio/high_mean": 0.0038788671663496643, |
| "clip_ratio/low_mean": 0.00035511364694684744, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.004233980813296512, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9181.0, |
| "completions/max_terminated_length": 9181.0, |
| "completions/mean_length": 8015.375, |
| "completions/mean_terminated_length": 8015.375, |
| "completions/min_length": 290.0, |
| "completions/min_terminated_length": 290.0, |
| "entropy": 0.1193245961330831, |
| "epoch": 0.004, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2326804399490356, |
| "kl": 0.016626078002445865, |
| "learning_rate": 7.5562325633039275e-06, |
| "loss": -0.167, |
| "num_tokens": 27439174.0, |
| "reward": -0.42884576320648193, |
| "reward_std": 0.32799845933914185, |
| "rewards/rollout_reward_func/mean": -0.42884576320648193, |
| "rewards/rollout_reward_func/std": 0.3549758791923523, |
| "sampling/importance_sampling_ratio/max": 1.9659780263900757, |
| "sampling/importance_sampling_ratio/mean": 0.9990392923355103, |
| "sampling/importance_sampling_ratio/min": 0.2670063078403473, |
| "sampling/sampling_logp_difference/max": 1.3204829692840576, |
| "sampling/sampling_logp_difference/mean": 0.013362506404519081, |
| "step": 100, |
| "step_time": 94.30247499899633 |
| }, |
| { |
| "clip_ratio/high_max": 0.01159447367535904, |
| "clip_ratio/high_mean": 0.00579723683767952, |
| "clip_ratio/low_mean": 0.0007812500116415322, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.006578486849321052, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9356.0, |
| "completions/max_terminated_length": 9356.0, |
| "completions/mean_length": 8226.75, |
| "completions/mean_terminated_length": 8226.75, |
| "completions/min_length": 3396.0, |
| "completions/min_terminated_length": 3396.0, |
| "entropy": 0.12249888107180595, |
| "epoch": 0.00404, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.0833367109298706, |
| "kl": 0.01735936271143146, |
| "learning_rate": 7.542622768678494e-06, |
| "loss": 0.0171, |
| "num_tokens": 27728261.0, |
| "reward": -0.3308451175689697, |
| "reward_std": 0.5605739951133728, |
| "rewards/rollout_reward_func/mean": -0.3308451175689697, |
| "rewards/rollout_reward_func/std": 0.5798808932304382, |
| "sampling/importance_sampling_ratio/max": 1.5370640754699707, |
| "sampling/importance_sampling_ratio/mean": 1.0012454986572266, |
| "sampling/importance_sampling_ratio/min": 0.5971825122833252, |
| "sampling/sampling_logp_difference/max": 0.5155324935913086, |
| "sampling/sampling_logp_difference/mean": 0.011695077642798424, |
| "step": 101, |
| "step_time": 95.23633311199774 |
| }, |
| { |
| "clip_ratio/high_max": 0.002826430252753198, |
| "clip_ratio/high_mean": 0.001413215126376599, |
| "clip_ratio/low_mean": 0.001412654877640307, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.002825870004016906, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9459.0, |
| "completions/max_terminated_length": 9459.0, |
| "completions/mean_length": 8216.125, |
| "completions/mean_terminated_length": 8216.125, |
| "completions/min_length": 1191.0, |
| "completions/min_terminated_length": 1191.0, |
| "entropy": 0.12594126863405108, |
| "epoch": 0.00408, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.3509782552719116, |
| "kl": 0.02238677909190301, |
| "learning_rate": 7.528824612064348e-06, |
| "loss": -0.0679, |
| "num_tokens": 28017012.0, |
| "reward": -0.3804386854171753, |
| "reward_std": 0.3933500647544861, |
| "rewards/rollout_reward_func/mean": -0.3804386854171753, |
| "rewards/rollout_reward_func/std": 0.4109920263290405, |
| "sampling/importance_sampling_ratio/max": 2.3366055488586426, |
| "sampling/importance_sampling_ratio/mean": 1.0009369850158691, |
| "sampling/importance_sampling_ratio/min": 0.46159544587135315, |
| "sampling/sampling_logp_difference/max": 0.8486993312835693, |
| "sampling/sampling_logp_difference/mean": 0.014044486917555332, |
| "step": 102, |
| "step_time": 95.00174767700264 |
| }, |
| { |
| "clip_ratio/high_max": 0.0028175070183351636, |
| "clip_ratio/high_mean": 0.0024504201719537377, |
| "clip_ratio/low_mean": 0.0021656582539435476, |
| "clip_ratio/low_min": 0.0006944444612599909, |
| "clip_ratio/region_mean": 0.004616078425897285, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9223.0, |
| "completions/max_terminated_length": 9223.0, |
| "completions/mean_length": 7621.125, |
| "completions/mean_terminated_length": 7621.125, |
| "completions/min_length": 1625.0, |
| "completions/min_terminated_length": 1625.0, |
| "entropy": 0.11429717438295484, |
| "epoch": 0.00412, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.284745216369629, |
| "kl": 0.04166489985072985, |
| "learning_rate": 7.5148391156533234e-06, |
| "loss": -0.0986, |
| "num_tokens": 28286718.0, |
| "reward": -0.3517298400402069, |
| "reward_std": 0.44646015763282776, |
| "rewards/rollout_reward_func/mean": -0.3517298400402069, |
| "rewards/rollout_reward_func/std": 0.5219200253486633, |
| "sampling/importance_sampling_ratio/max": 2.0663206577301025, |
| "sampling/importance_sampling_ratio/mean": 1.0006217956542969, |
| "sampling/importance_sampling_ratio/min": 0.3164287209510803, |
| "sampling/sampling_logp_difference/max": 1.150657296180725, |
| "sampling/sampling_logp_difference/mean": 0.016996370628476143, |
| "step": 103, |
| "step_time": 89.48946891300147 |
| }, |
| { |
| "clip_ratio/high_max": 0.006022654706612229, |
| "clip_ratio/high_mean": 0.003717744955793023, |
| "clip_ratio/low_mean": 0.0033087176270782948, |
| "clip_ratio/low_min": 0.0007022471982054412, |
| "clip_ratio/region_mean": 0.007026462582871318, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9336.0, |
| "completions/max_terminated_length": 9336.0, |
| "completions/mean_length": 7775.3125, |
| "completions/mean_terminated_length": 7775.3125, |
| "completions/min_length": 288.0, |
| "completions/min_terminated_length": 288.0, |
| "entropy": 0.11511457245796919, |
| "epoch": 0.00416, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1231155395507812, |
| "kl": 0.02951129319262691, |
| "learning_rate": 7.500667315515709e-06, |
| "loss": -0.1068, |
| "num_tokens": 28561357.0, |
| "reward": -0.39338138699531555, |
| "reward_std": 0.42901262640953064, |
| "rewards/rollout_reward_func/mean": -0.39338138699531555, |
| "rewards/rollout_reward_func/std": 0.48031023144721985, |
| "sampling/importance_sampling_ratio/max": 2.4638149738311768, |
| "sampling/importance_sampling_ratio/mean": 0.9997503757476807, |
| "sampling/importance_sampling_ratio/min": 0.5230858325958252, |
| "sampling/sampling_logp_difference/max": 0.9017109870910645, |
| "sampling/sampling_logp_difference/mean": 0.012721032835543156, |
| "step": 104, |
| "step_time": 93.24399638200339 |
| }, |
| { |
| "clip_ratio/high_max": 0.0027933833189308643, |
| "clip_ratio/high_mean": 0.0013966916594654322, |
| "clip_ratio/low_mean": 0.0025065841618925333, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.003903275792254135, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9330.0, |
| "completions/max_terminated_length": 9330.0, |
| "completions/mean_length": 8119.09375, |
| "completions/mean_terminated_length": 8119.09375, |
| "completions/min_length": 3061.0, |
| "completions/min_terminated_length": 3061.0, |
| "entropy": 0.11791149899363518, |
| "epoch": 0.0042, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.2128372192382812, |
| "kl": 0.027731850394047797, |
| "learning_rate": 7.486310261523511e-06, |
| "loss": 0.0399, |
| "num_tokens": 28847024.0, |
| "reward": -0.3779003918170929, |
| "reward_std": 0.5017719268798828, |
| "rewards/rollout_reward_func/mean": -0.3779003918170929, |
| "rewards/rollout_reward_func/std": 0.5323984622955322, |
| "sampling/importance_sampling_ratio/max": 1.925367832183838, |
| "sampling/importance_sampling_ratio/mean": 1.0002028942108154, |
| "sampling/importance_sampling_ratio/min": 0.3569715917110443, |
| "sampling/sampling_logp_difference/max": 1.0300991535186768, |
| "sampling/sampling_logp_difference/mean": 0.01532074436545372, |
| "step": 105, |
| "step_time": 93.38768337700094 |
| }, |
| { |
| "clip_ratio/high_max": 0.005748430092353374, |
| "clip_ratio/high_mean": 0.002874215046176687, |
| "clip_ratio/low_mean": 0.002773832093225792, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.005648046964779496, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9337.0, |
| "completions/max_terminated_length": 9337.0, |
| "completions/mean_length": 7877.40625, |
| "completions/mean_terminated_length": 7877.40625, |
| "completions/min_length": 1862.0, |
| "completions/min_terminated_length": 1862.0, |
| "entropy": 0.10635959357023239, |
| "epoch": 0.00424, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 1.1976176500320435, |
| "kl": 0.030401008145418018, |
| "learning_rate": 7.471769017272662e-06, |
| "loss": 0.0169, |
| "num_tokens": 29124980.0, |
| "reward": -0.40934568643569946, |
| "reward_std": 0.4898146986961365, |
| "rewards/rollout_reward_func/mean": -0.40934568643569946, |
| "rewards/rollout_reward_func/std": 0.5182675719261169, |
| "sampling/importance_sampling_ratio/max": 1.8533921241760254, |
| "sampling/importance_sampling_ratio/mean": 0.9990241527557373, |
| "sampling/importance_sampling_ratio/min": 0.43001580238342285, |
| "sampling/sampling_logp_difference/max": 0.8439333438873291, |
| "sampling/sampling_logp_difference/mean": 0.01345333456993103, |
| "step": 106, |
| "step_time": 92.02846917199531 |
| } |
| ], |
| "logging_steps": 1.0, |
| "max_steps": 400, |
| "num_input_tokens_seen": 29124980, |
| "num_train_epochs": 1, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 1, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|