| { |
| "best_global_step": null, |
| "best_metric": 0.8869044184684753, |
| "best_model_checkpoint": null, |
| "epoch": 0.13192612137203166, |
| "eval_steps": 100, |
| "global_step": 400, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 15.6, |
| "completions/mean_length": 31.29375, |
| "completions/mean_terminated_length": 13.3, |
| "completions/min_length": 11.0, |
| "completions/min_terminated_length": 11.0, |
| "epoch": 0.0016490765171503958, |
| "grad_norm": 1.0301371812820435, |
| "kl": 0.000438690185546875, |
| "learning_rate": 8.000000000000001e-06, |
| "loss": 0.02, |
| "num_tokens": 14709.0, |
| "reward": 15.14717788696289, |
| "reward_std": 2.122625803947449, |
| "rewards/conciseness_reward/mean": 3.1040622711181642, |
| "rewards/conciseness_reward/std": 1.0407999098300933, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 8.2, |
| "completions/mean_length": 31.4125, |
| "completions/mean_terminated_length": 5.0, |
| "completions/min_length": 19.6, |
| "completions/min_terminated_length": 0.4, |
| "epoch": 0.0032981530343007917, |
| "grad_norm": 1.00534188747406, |
| "kl": 0.0009120941162109375, |
| "learning_rate": 1.8e-05, |
| "loss": 0.0162, |
| "num_tokens": 31719.0, |
| "reward": 15.080268859863281, |
| "reward_std": 1.7717459440231322, |
| "rewards/conciseness_reward/mean": 3.090350866317749, |
| "rewards/conciseness_reward/std": 0.7454259812831878, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 11.8, |
| "completions/mean_length": 31.375, |
| "completions/mean_terminated_length": 11.8, |
| "completions/min_length": 18.2, |
| "completions/min_terminated_length": 11.8, |
| "epoch": 0.004947229551451188, |
| "grad_norm": 1.1287697553634644, |
| "kl": 0.002832794189453125, |
| "learning_rate": 2.8000000000000003e-05, |
| "loss": 0.0068, |
| "num_tokens": 49875.0, |
| "reward": 15.289332962036132, |
| "reward_std": 1.7224721908569336, |
| "rewards/conciseness_reward/mean": 3.133193778991699, |
| "rewards/conciseness_reward/std": 1.0818009793758392, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 11.4, |
| "completions/mean_length": 30.39375, |
| "completions/mean_terminated_length": 7.133333468437195, |
| "completions/min_length": 11.4, |
| "completions/min_terminated_length": 5.0, |
| "epoch": 0.006596306068601583, |
| "grad_norm": 1.091719627380371, |
| "kl": 0.01212921142578125, |
| "learning_rate": 3.8e-05, |
| "loss": 0.0123, |
| "num_tokens": 65538.0, |
| "reward": 16.047473907470703, |
| "reward_std": 1.4964761018753052, |
| "rewards/conciseness_reward/mean": 3.288557195663452, |
| "rewards/conciseness_reward/std": 1.2817005276679994, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.95, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 21.6, |
| "completions/mean_length": 31.35625, |
| "completions/mean_terminated_length": 20.3, |
| "completions/min_length": 19.0, |
| "completions/min_terminated_length": 19.0, |
| "epoch": 0.008245382585751979, |
| "grad_norm": 1.1559650897979736, |
| "kl": 0.0252685546875, |
| "learning_rate": 4.8e-05, |
| "loss": 0.0148, |
| "num_tokens": 81341.0, |
| "reward": 15.850406646728516, |
| "reward_std": 1.5481059789657592, |
| "rewards/conciseness_reward/mean": 3.2481727600097656, |
| "rewards/conciseness_reward/std": 0.7157365679740906, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.95625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 11.2, |
| "completions/mean_length": 31.24375, |
| "completions/mean_terminated_length": 9.4, |
| "completions/min_length": 19.6, |
| "completions/min_terminated_length": 6.8, |
| "epoch": 0.009894459102902375, |
| "grad_norm": 1.179612636566162, |
| "kl": 0.041455078125, |
| "learning_rate": 5.8e-05, |
| "loss": 0.0168, |
| "num_tokens": 96376.0, |
| "reward": 16.42060241699219, |
| "reward_std": 2.5188846826553344, |
| "rewards/conciseness_reward/mean": 3.3650211811065676, |
| "rewards/conciseness_reward/std": 0.9817604303359986, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.94375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 9.6, |
| "completions/mean_length": 30.7375, |
| "completions/mean_terminated_length": 5.9333335876464846, |
| "completions/min_length": 7.2, |
| "completions/min_terminated_length": 0.8, |
| "epoch": 0.01154353562005277, |
| "grad_norm": 1.2418527603149414, |
| "kl": 0.0891357421875, |
| "learning_rate": 6.800000000000001e-05, |
| "loss": 0.0289, |
| "num_tokens": 110096.0, |
| "reward": 17.450071144104005, |
| "reward_std": 2.6924588203430178, |
| "rewards/conciseness_reward/mean": 3.5759867668151855, |
| "rewards/conciseness_reward/std": 1.3786604046821593, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.79375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 21.4, |
| "completions/mean_length": 26.95, |
| "completions/mean_terminated_length": 8.594264364242553, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.013192612137203167, |
| "grad_norm": 1.533212661743164, |
| "kl": 0.17685546875, |
| "learning_rate": 7.800000000000001e-05, |
| "loss": 0.0944, |
| "num_tokens": 125832.0, |
| "reward": 22.291907501220702, |
| "reward_std": 5.0477148532867435, |
| "rewards/conciseness_reward/mean": 4.568208789825439, |
| "rewards/conciseness_reward/std": 2.234351325035095, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.525, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 25.2, |
| "completions/mean_length": 18.95625, |
| "completions/mean_terminated_length": 4.880657196044922, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.014841688654353561, |
| "grad_norm": 2.227647304534912, |
| "kl": 0.81318359375, |
| "learning_rate": 8.800000000000001e-05, |
| "loss": 0.2385, |
| "num_tokens": 138589.0, |
| "reward": 31.8361515045166, |
| "reward_std": 6.978989696502685, |
| "rewards/conciseness_reward/mean": 6.524080085754394, |
| "rewards/conciseness_reward/std": 2.7887794971466064, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1375, |
| "completions/max_length": 26.0, |
| "completions/max_terminated_length": 16.6, |
| "completions/mean_length": 6.43125, |
| "completions/mean_terminated_length": 2.5217425346374513, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.016490765171503958, |
| "grad_norm": 0.19684094190597534, |
| "kl": 3.301953125, |
| "learning_rate": 9.8e-05, |
| "loss": 0.3463, |
| "num_tokens": 151914.0, |
| "reward": 43.79222412109375, |
| "reward_std": 4.306332683563232, |
| "rewards/conciseness_reward/mean": 8.974199676513672, |
| "rewards/conciseness_reward/std": 1.7521500557661056, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 9.0, |
| "completions/max_terminated_length": 9.0, |
| "completions/mean_length": 1.31875, |
| "completions/mean_terminated_length": 1.31875, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.018139841688654353, |
| "grad_norm": 0.013599345460534096, |
| "kl": 6.190625, |
| "learning_rate": 0.00010800000000000001, |
| "loss": 0.3072, |
| "num_tokens": 163061.0, |
| "reward": 48.47799987792969, |
| "reward_std": 0.45243007838726046, |
| "rewards/conciseness_reward/mean": 9.934440612792969, |
| "rewards/conciseness_reward/std": 0.28381501138210297, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.01978891820580475, |
| "grad_norm": 0.0023122939746826887, |
| "kl": 6.559375, |
| "learning_rate": 0.000118, |
| "loss": 0.2624, |
| "num_tokens": 173693.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.2, |
| "completions/max_terminated_length": 1.2, |
| "completions/mean_length": 1.00625, |
| "completions/mean_terminated_length": 1.00625, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.021437994722955146, |
| "grad_norm": 0.0008344887173734605, |
| "kl": 28.259375, |
| "learning_rate": 0.00012800000000000002, |
| "loss": 1.1298, |
| "num_tokens": 183802.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.02308707124010554, |
| "grad_norm": 0.011819743551313877, |
| "kl": 6.371875, |
| "learning_rate": 0.000138, |
| "loss": 0.2552, |
| "num_tokens": 195132.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.4, |
| "completions/max_terminated_length": 1.4, |
| "completions/mean_length": 1.01875, |
| "completions/mean_terminated_length": 1.01875, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.024736147757255935, |
| "grad_norm": 0.02948692999780178, |
| "kl": 6.521875, |
| "learning_rate": 0.000148, |
| "loss": 0.2609, |
| "num_tokens": 204273.0, |
| "reward": 48.74246368408203, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 9.98863639831543, |
| "rewards/conciseness_reward/std": 0.044715401530265805, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.01875, |
| "completions/max_length": 21.8, |
| "completions/max_terminated_length": 6.4, |
| "completions/mean_length": 1.76875, |
| "completions/mean_terminated_length": 1.191330623626709, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.026385224274406333, |
| "grad_norm": 0.6150962114334106, |
| "kl": 5.475, |
| "learning_rate": 0.00015800000000000002, |
| "loss": 0.3071, |
| "num_tokens": 217108.0, |
| "reward": 48.03885269165039, |
| "reward_std": 1.0734764248132707, |
| "rewards/conciseness_reward/mean": 9.844447708129882, |
| "rewards/conciseness_reward/std": 0.7437670588493347, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.028034300791556728, |
| "grad_norm": 0.004543509799987078, |
| "kl": 6.296875, |
| "learning_rate": 0.000168, |
| "loss": 0.2519, |
| "num_tokens": 228886.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.4, |
| "completions/max_terminated_length": 1.4, |
| "completions/mean_length": 1.025, |
| "completions/mean_terminated_length": 1.025, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.029683377308707123, |
| "grad_norm": 0.004970578011125326, |
| "kl": 6.803125, |
| "learning_rate": 0.00017800000000000002, |
| "loss": 0.2777, |
| "num_tokens": 238962.0, |
| "reward": 48.71935882568359, |
| "reward_std": 0.11109672784805298, |
| "rewards/conciseness_reward/mean": 9.983901596069336, |
| "rewards/conciseness_reward/std": 0.06620492339134217, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.6, |
| "completions/max_terminated_length": 1.6, |
| "completions/mean_length": 1.01875, |
| "completions/mean_terminated_length": 1.01875, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.03133245382585752, |
| "grad_norm": 1.1012120246887207, |
| "kl": 6.246875, |
| "learning_rate": 0.000188, |
| "loss": 0.2593, |
| "num_tokens": 250757.0, |
| "reward": 48.74246444702148, |
| "reward_std": 0.07842119336128235, |
| "rewards/conciseness_reward/mean": 9.98863639831543, |
| "rewards/conciseness_reward/std": 0.06428244113922119, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.032981530343007916, |
| "grad_norm": 0.030409209430217743, |
| "learning_rate": 0.00019800000000000002, |
| "loss": 0.2568, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.032981530343007916, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.0, |
| "eval_completions/max_length": 1.0934065934065933, |
| "eval_completions/max_terminated_length": 1.0934065934065933, |
| "eval_completions/mean_length": 1.0061813186813187, |
| "eval_completions/mean_terminated_length": 1.0061813186813187, |
| "eval_completions/min_length": 1.0, |
| "eval_completions/min_terminated_length": 1.0, |
| "eval_kl": 6.581902472527473, |
| "eval_loss": 0.26390206813812256, |
| "eval_num_tokens": 260814.0, |
| "eval_reward": 48.79334613255092, |
| "eval_reward_std": 0.006463285167138655, |
| "eval_rewards/conciseness_reward/mean": 9.999063439421601, |
| "eval_rewards/conciseness_reward/std": 0.0037462543491478804, |
| "eval_rewards/reward_func_correct_answer/mean": 0.0, |
| "eval_rewards/reward_func_correct_answer/std": 0.0, |
| "eval_rewards/reward_func_keywords/mean": 0.0, |
| "eval_rewards/reward_func_keywords/std": 0.0, |
| "eval_runtime": 27.0556, |
| "eval_samples_per_second": 53.778, |
| "eval_steps_per_second": 3.363, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.2, |
| "completions/max_terminated_length": 1.2, |
| "completions/mean_length": 1.00625, |
| "completions/mean_terminated_length": 1.00625, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.034630606860158314, |
| "grad_norm": 0.0008123432635329664, |
| "kl": 6.45625, |
| "learning_rate": 0.00019999025240093044, |
| "loss": 0.2631, |
| "num_tokens": 271803.0, |
| "reward": 48.78405342102051, |
| "reward_std": 0.019605298340320588, |
| "rewards/conciseness_reward/mean": 9.997159099578857, |
| "rewards/conciseness_reward/std": 0.016070610284805296, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.036279683377308705, |
| "grad_norm": 0.00020162259170319885, |
| "kl": 6.33125, |
| "learning_rate": 0.00019995065603657316, |
| "loss": 0.2533, |
| "num_tokens": 284121.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.0379287598944591, |
| "grad_norm": 0.00028325500898063183, |
| "kl": 6.35625, |
| "learning_rate": 0.0001998806137341434, |
| "loss": 0.2545, |
| "num_tokens": 296089.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.0395778364116095, |
| "grad_norm": 0.00010048302647192031, |
| "kl": 6.546875, |
| "learning_rate": 0.000199780146829205, |
| "loss": 0.2619, |
| "num_tokens": 308593.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.04122691292875989, |
| "grad_norm": 5.4579424613621086e-05, |
| "kl": 6.2625, |
| "learning_rate": 0.00019964928592495045, |
| "loss": 0.2505, |
| "num_tokens": 319585.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.04287598944591029, |
| "grad_norm": 5.583846723311581e-05, |
| "kl": 6.171875, |
| "learning_rate": 0.00019948807088287883, |
| "loss": 0.2469, |
| "num_tokens": 330515.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.04452506596306069, |
| "grad_norm": 3.738172017619945e-05, |
| "kl": 6.175, |
| "learning_rate": 0.0001992965508106537, |
| "loss": 0.247, |
| "num_tokens": 341113.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.04617414248021108, |
| "grad_norm": 3.934466440114193e-05, |
| "kl": 6.465625, |
| "learning_rate": 0.00019907478404714436, |
| "loss": 0.2587, |
| "num_tokens": 351807.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.04782321899736148, |
| "grad_norm": 0.00020795360615011305, |
| "kl": 6.440625, |
| "learning_rate": 0.0001988228381446553, |
| "loss": 0.2575, |
| "num_tokens": 362049.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.04947229551451187, |
| "grad_norm": 3.9276594179682434e-05, |
| "kl": 6.415625, |
| "learning_rate": 0.00019854078984834903, |
| "loss": 0.2569, |
| "num_tokens": 372893.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 150 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.05112137203166227, |
| "grad_norm": 7.476914470316842e-05, |
| "kl": 6.3125, |
| "learning_rate": 0.0001982287250728689, |
| "loss": 0.2526, |
| "num_tokens": 383645.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 1.0, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.052770448548812667, |
| "grad_norm": 0.0009598923497833312, |
| "kl": 6.41875, |
| "learning_rate": 0.0001978867388761685, |
| "loss": 0.257, |
| "num_tokens": 394303.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0, |
| "completions/max_length": 1.2, |
| "completions/max_terminated_length": 1.2, |
| "completions/mean_length": 1.00625, |
| "completions/mean_terminated_length": 1.00625, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.05441952506596306, |
| "grad_norm": 0.0006444460013881326, |
| "kl": 6.83125, |
| "learning_rate": 0.00019751493543055632, |
| "loss": 0.2732, |
| "num_tokens": 405606.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 165 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 13.8, |
| "completions/max_terminated_length": 11.2, |
| "completions/mean_length": 4.44375, |
| "completions/mean_terminated_length": 2.8541286468505858, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.056068601583113456, |
| "grad_norm": 0.2734026610851288, |
| "kl": 6.778125, |
| "learning_rate": 0.00019711342799096361, |
| "loss": 0.2783, |
| "num_tokens": 417897.0, |
| "reward": 48.74246444702148, |
| "reward_std": 0.07842119336128235, |
| "rewards/conciseness_reward/mean": 9.98863639831543, |
| "rewards/conciseness_reward/std": 0.06428244113922119, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 170 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 9.2, |
| "completions/mean_length": 27.825, |
| "completions/mean_terminated_length": 4.833333539962768, |
| "completions/min_length": 8.2, |
| "completions/min_terminated_length": 1.8, |
| "epoch": 0.057717678100263854, |
| "grad_norm": 0.005825403146445751, |
| "kl": 3.1109375, |
| "learning_rate": 0.00019668233886044597, |
| "loss": 0.1245, |
| "num_tokens": 432469.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.85625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 18.6, |
| "completions/mean_length": 28.75625, |
| "completions/mean_terminated_length": 9.920000028610229, |
| "completions/min_length": 2.2, |
| "completions/min_terminated_length": 2.2, |
| "epoch": 0.059366754617414245, |
| "grad_norm": 0.014362619258463383, |
| "kl": 2.59609375, |
| "learning_rate": 0.00019622179935292855, |
| "loss": 0.1039, |
| "num_tokens": 447002.0, |
| "reward": 48.7701904296875, |
| "reward_std": 0.039210596680641176, |
| "rewards/conciseness_reward/mean": 9.994318199157714, |
| "rewards/conciseness_reward/std": 0.03214122056961059, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 0.4, |
| "completions/mean_length": 31.625, |
| "completions/mean_terminated_length": 0.4, |
| "completions/min_length": 26.0, |
| "completions/min_terminated_length": 0.4, |
| "epoch": 0.061015831134564644, |
| "grad_norm": 0.011907841078937054, |
| "kl": 1.64453125, |
| "learning_rate": 0.00019573194975320673, |
| "loss": 0.0658, |
| "num_tokens": 461740.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.99375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 4.6, |
| "completions/mean_length": 31.94375, |
| "completions/mean_terminated_length": 4.6, |
| "completions/min_length": 30.2, |
| "completions/min_terminated_length": 4.6, |
| "epoch": 0.06266490765171503, |
| "grad_norm": 0.002289639785885811, |
| "kl": 1.58515625, |
| "learning_rate": 0.00019521293927421388, |
| "loss": 0.0634, |
| "num_tokens": 476849.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 190 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 8.0, |
| "completions/mean_length": 31.85, |
| "completions/mean_terminated_length": 8.0, |
| "completions/min_length": 27.2, |
| "completions/min_terminated_length": 8.0, |
| "epoch": 0.06431398416886544, |
| "grad_norm": 0.01182704046368599, |
| "kl": 1.59765625, |
| "learning_rate": 0.00019466492601156966, |
| "loss": 0.0638, |
| "num_tokens": 492277.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.06596306068601583, |
| "grad_norm": 0.008451790548861027, |
| "learning_rate": 0.00019408807689542257, |
| "loss": 0.0666, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.06596306068601583, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.9807692307692307, |
| "eval_completions/max_length": 32.0, |
| "eval_completions/max_terminated_length": 4.230769230769231, |
| "eval_completions/mean_length": 31.67135989010989, |
| "eval_completions/mean_terminated_length": 3.882783884530539, |
| "eval_completions/min_length": 27.142857142857142, |
| "eval_completions/min_terminated_length": 3.5824175824175826, |
| "eval_kl": 1.6657366071428572, |
| "eval_loss": 0.0666266530752182, |
| "eval_num_tokens": 510020.0, |
| "eval_reward": 48.78877585274832, |
| "eval_reward_std": 0.01292657033427731, |
| "eval_rewards/conciseness_reward/mean": 9.998126878843202, |
| "eval_rewards/conciseness_reward/std": 0.007492508698295761, |
| "eval_rewards/reward_func_correct_answer/mean": 0.0, |
| "eval_rewards/reward_func_correct_answer/std": 0.0, |
| "eval_rewards/reward_func_keywords/mean": 0.0, |
| "eval_rewards/reward_func_keywords/std": 0.0, |
| "eval_runtime": 264.6454, |
| "eval_samples_per_second": 5.498, |
| "eval_steps_per_second": 0.344, |
| "step": 200 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.971875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 12.2, |
| "completions/mean_length": 31.621875, |
| "completions/mean_terminated_length": 10.7, |
| "completions/min_length": 22.0, |
| "completions/min_terminated_length": 9.2, |
| "epoch": 0.06761213720316622, |
| "grad_norm": 0.016916701570153236, |
| "kl": 1.674609375, |
| "learning_rate": 0.00019348256763960145, |
| "loss": 0.0673, |
| "num_tokens": 524948.0, |
| "reward": 48.78405342102051, |
| "reward_std": 0.019605298340320588, |
| "rewards/conciseness_reward/mean": 9.997159099578857, |
| "rewards/conciseness_reward/std": 0.016070610284805296, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 205 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.98125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 8.0, |
| "completions/mean_length": 31.65, |
| "completions/mean_terminated_length": 8.0, |
| "completions/min_length": 20.8, |
| "completions/min_terminated_length": 8.0, |
| "epoch": 0.06926121372031663, |
| "grad_norm": 0.008241601288318634, |
| "kl": 1.65078125, |
| "learning_rate": 0.00019284858268809137, |
| "loss": 0.066, |
| "num_tokens": 540530.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 210 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 8.8, |
| "completions/mean_length": 31.29375, |
| "completions/mean_terminated_length": 7.8, |
| "completions/min_length": 12.8, |
| "completions/min_terminated_length": 6.4, |
| "epoch": 0.07091029023746702, |
| "grad_norm": 0.00787361804395914, |
| "kl": 1.70546875, |
| "learning_rate": 0.00019218631515885006, |
| "loss": 0.0682, |
| "num_tokens": 556357.0, |
| "reward": 48.797916412353516, |
| "reward_std": 0.0, |
| "rewards/conciseness_reward/mean": 10.0, |
| "rewards/conciseness_reward/std": 0.0, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 215 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.98125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 10.2, |
| "completions/mean_length": 31.71875, |
| "completions/mean_terminated_length": 10.2, |
| "completions/min_length": 23.0, |
| "completions/min_terminated_length": 10.2, |
| "epoch": 0.07255936675461741, |
| "grad_norm": 0.02177685871720314, |
| "kl": 1.64609375, |
| "learning_rate": 0.0001914959667849825, |
| "loss": 0.0659, |
| "num_tokens": 572716.0, |
| "reward": 48.7701904296875, |
| "reward_std": 0.039210596680641176, |
| "rewards/conciseness_reward/mean": 9.994318199157714, |
| "rewards/conciseness_reward/std": 0.03214122056961059, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 220 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.95625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 11.2, |
| "completions/mean_length": 31.44375, |
| "completions/mean_terminated_length": 8.083333587646484, |
| "completions/min_length": 24.6, |
| "completions/min_terminated_length": 5.4, |
| "epoch": 0.07420844327176782, |
| "grad_norm": 0.5946508646011353, |
| "kl": 1.86953125, |
| "learning_rate": 0.00019077774785329087, |
| "loss": 0.0748, |
| "num_tokens": 588551.0, |
| "reward": 48.385579681396486, |
| "reward_std": 0.3609386831521988, |
| "rewards/conciseness_reward/mean": 9.91550121307373, |
| "rewards/conciseness_reward/std": 0.1702045440673828, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 225 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.86875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 20.8, |
| "completions/mean_length": 30.075, |
| "completions/mean_terminated_length": 14.540000343322754, |
| "completions/min_length": 15.6, |
| "completions/min_terminated_length": 9.2, |
| "epoch": 0.0758575197889182, |
| "grad_norm": 0.0492733009159565, |
| "kl": 2.21875, |
| "learning_rate": 0.00019003187714021938, |
| "loss": 0.0943, |
| "num_tokens": 604083.0, |
| "reward": 48.42716827392578, |
| "reward_std": 0.44589495956897734, |
| "rewards/conciseness_reward/mean": 9.92402400970459, |
| "rewards/conciseness_reward/std": 0.20466775298118592, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 230 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.68125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 30.4, |
| "completions/mean_length": 27.01875, |
| "completions/mean_terminated_length": 16.836363792419434, |
| "completions/min_length": 3.8, |
| "completions/min_terminated_length": 3.8, |
| "epoch": 0.0775065963060686, |
| "grad_norm": 0.8122158050537109, |
| "kl": 3.3203125, |
| "learning_rate": 0.00018925858184521256, |
| "loss": 0.144, |
| "num_tokens": 618996.0, |
| "reward": 45.936913299560544, |
| "reward_std": 3.0336299002170564, |
| "rewards/conciseness_reward/mean": 9.413703918457031, |
| "rewards/conciseness_reward/std": 0.8155404955148697, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 235 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.51875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 21.4, |
| "completions/mean_length": 20.75, |
| "completions/mean_terminated_length": 8.305263471603393, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.079155672823219, |
| "grad_norm": 0.22038057446479797, |
| "kl": 3.075, |
| "learning_rate": 0.0001884580975215084, |
| "loss": 0.1218, |
| "num_tokens": 633380.0, |
| "reward": 48.46982192993164, |
| "reward_std": 0.46399208903312683, |
| "rewards/conciseness_reward/mean": 9.932765197753906, |
| "rewards/conciseness_reward/std": 0.2160962074995041, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 240 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.66875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 1.4, |
| "completions/mean_length": 21.74375, |
| "completions/mean_terminated_length": 1.0307692289352417, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.0808047493403694, |
| "grad_norm": 0.5326492786407471, |
| "kl": 2.6203125, |
| "learning_rate": 0.00018763066800438636, |
| "loss": 0.1141, |
| "num_tokens": 647049.0, |
| "reward": 48.326927947998044, |
| "reward_std": 0.6660776942968368, |
| "rewards/conciseness_reward/mean": 9.903482246398926, |
| "rewards/conciseness_reward/std": 0.3215636372566223, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 245 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.95, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 2.2, |
| "completions/mean_length": 30.49375, |
| "completions/mean_terminated_length": 1.2666666984558106, |
| "completions/min_length": 7.2, |
| "completions/min_terminated_length": 0.8, |
| "epoch": 0.08245382585751979, |
| "grad_norm": 0.5493065714836121, |
| "kl": 1.91875, |
| "learning_rate": 0.00018677654533689287, |
| "loss": 0.088, |
| "num_tokens": 660962.0, |
| "reward": 46.19136734008789, |
| "reward_std": 3.174171257019043, |
| "rewards/conciseness_reward/mean": 9.465848350524903, |
| "rewards/conciseness_reward/std": 1.1706744194030763, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 250 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.925, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 7.4, |
| "completions/mean_length": 29.89375, |
| "completions/mean_terminated_length": 2.940000057220459, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.08410290237467019, |
| "grad_norm": 0.4117783308029175, |
| "kl": 1.5734375, |
| "learning_rate": 0.00018589598969306645, |
| "loss": 0.0831, |
| "num_tokens": 676141.0, |
| "reward": 45.986488342285156, |
| "reward_std": 3.3635273456573485, |
| "rewards/conciseness_reward/mean": 9.423862838745118, |
| "rewards/conciseness_reward/std": 1.503001594543457, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 255 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.925, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 3.4, |
| "completions/mean_length": 29.75625, |
| "completions/mean_terminated_length": 1.5200000286102295, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.08575197889182058, |
| "grad_norm": 0.5317473411560059, |
| "kl": 1.50234375, |
| "learning_rate": 0.00018498926929868642, |
| "loss": 0.0686, |
| "num_tokens": 691010.0, |
| "reward": 46.40044937133789, |
| "reward_std": 3.155266261100769, |
| "rewards/conciseness_reward/mean": 9.508695220947265, |
| "rewards/conciseness_reward/std": 1.3738978862762452, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 260 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 0.6, |
| "completions/mean_length": 31.03125, |
| "completions/mean_terminated_length": 0.6, |
| "completions/min_length": 13.4, |
| "completions/min_terminated_length": 0.6, |
| "epoch": 0.08740105540897097, |
| "grad_norm": 0.5502648949623108, |
| "kl": 1.578125, |
| "learning_rate": 0.00018405666034956844, |
| "loss": 0.0744, |
| "num_tokens": 704749.0, |
| "reward": 45.646009826660155, |
| "reward_std": 3.507078266143799, |
| "rewards/conciseness_reward/mean": 9.354090118408203, |
| "rewards/conciseness_reward/std": 1.490102195739746, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 265 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.98125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 0.6, |
| "completions/mean_length": 31.41875, |
| "completions/mean_terminated_length": 0.6, |
| "completions/min_length": 13.4, |
| "completions/min_terminated_length": 0.6, |
| "epoch": 0.08905013192612138, |
| "grad_norm": 1.43314790725708, |
| "kl": 3.23203125, |
| "learning_rate": 0.00018309844692743283, |
| "loss": 0.1462, |
| "num_tokens": 722046.0, |
| "reward": 42.508016967773436, |
| "reward_std": 4.8550762176513675, |
| "rewards/conciseness_reward/mean": 8.711030864715577, |
| "rewards/conciseness_reward/std": 1.546305203437805, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 270 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.55, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 27.0, |
| "completions/mean_length": 22.05, |
| "completions/mean_terminated_length": 10.424438858032227, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.09069920844327177, |
| "grad_norm": 1.572669506072998, |
| "kl": 5.246875, |
| "learning_rate": 0.00018211492091337042, |
| "loss": 0.2506, |
| "num_tokens": 735388.0, |
| "reward": 42.087843322753905, |
| "reward_std": 5.907807731628418, |
| "rewards/conciseness_reward/mean": 8.624926567077637, |
| "rewards/conciseness_reward/std": 1.7810691118240356, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 275 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.08125, |
| "completions/max_length": 31.8, |
| "completions/max_terminated_length": 27.2, |
| "completions/mean_length": 7.725, |
| "completions/mean_terminated_length": 5.570255327224731, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.09234828496042216, |
| "grad_norm": 3.4831130504608154, |
| "kl": 10.640625, |
| "learning_rate": 0.00018110638189893267, |
| "loss": 0.7782, |
| "num_tokens": 746406.0, |
| "reward": 42.14253387451172, |
| "reward_std": 7.03149824142456, |
| "rewards/conciseness_reward/mean": 8.636133575439453, |
| "rewards/conciseness_reward/std": 1.9473312377929688, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 280 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.275, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 26.2, |
| "completions/mean_length": 11.4875, |
| "completions/mean_terminated_length": 3.7349055290222166, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.09399736147757255, |
| "grad_norm": 1.5109800100326538, |
| "kl": 7.4125, |
| "learning_rate": 0.00018007313709487334, |
| "loss": 0.6542, |
| "num_tokens": 758456.0, |
| "reward": 36.26386070251465, |
| "reward_std": 9.148859119415283, |
| "rewards/conciseness_reward/mean": 7.4314359664917, |
| "rewards/conciseness_reward/std": 3.2661546230316163, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 285 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 24.6, |
| "completions/mean_length": 8.7125, |
| "completions/mean_terminated_length": 3.357792377471924, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.09564643799472296, |
| "grad_norm": 6.532775402069092, |
| "kl": 11.23125, |
| "learning_rate": 0.00017901550123756906, |
| "loss": 0.9046, |
| "num_tokens": 770046.0, |
| "reward": 40.13367538452148, |
| "reward_std": 9.75949649810791, |
| "rewards/conciseness_reward/mean": 8.224464702606202, |
| "rewards/conciseness_reward/std": 2.8002068042755126, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 290 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 20.2, |
| "completions/mean_length": 6.275, |
| "completions/mean_terminated_length": 2.19633367061615, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.09729551451187335, |
| "grad_norm": 2.8640220165252686, |
| "kl": 9.88125, |
| "learning_rate": 0.00017793379649314744, |
| "loss": 0.6994, |
| "num_tokens": 782316.0, |
| "reward": 42.97630233764649, |
| "reward_std": 5.981427621841431, |
| "rewards/conciseness_reward/mean": 8.806995010375976, |
| "rewards/conciseness_reward/std": 2.3633262157440185, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.09894459102902374, |
| "grad_norm": 2.0157830715179443, |
| "learning_rate": 0.00017682835235935236, |
| "loss": 0.7205, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.09894459102902374, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.2685897435897436, |
| "eval_completions/max_length": 31.82967032967033, |
| "eval_completions/max_terminated_length": 14.214285714285714, |
| "eval_completions/mean_length": 10.580402932324253, |
| "eval_completions/mean_terminated_length": 2.7016365750805362, |
| "eval_completions/min_length": 1.0, |
| "eval_completions/min_terminated_length": 1.0, |
| "eval_kl": 6.191105769230769, |
| "eval_loss": 0.6344618797302246, |
| "eval_num_tokens": 796625.0, |
| "eval_reward": 38.31290377103365, |
| "eval_reward_std": 9.540224014320872, |
| "eval_rewards/conciseness_reward/mean": 7.851340081665542, |
| "eval_rewards/conciseness_reward/std": 3.031103436108474, |
| "eval_rewards/reward_func_correct_answer/mean": 0.0, |
| "eval_rewards/reward_func_correct_answer/std": 0.0, |
| "eval_rewards/reward_func_keywords/mean": 0.0, |
| "eval_rewards/reward_func_keywords/std": 0.0, |
| "eval_runtime": 258.9208, |
| "eval_samples_per_second": 5.619, |
| "eval_steps_per_second": 0.351, |
| "step": 300 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 20.9, |
| "completions/mean_length": 11.653125, |
| "completions/mean_terminated_length": 2.9934032917022706, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.10059366754617415, |
| "grad_norm": 2.035731077194214, |
| "kl": 9.2609375, |
| "learning_rate": 0.00017569950556517566, |
| "loss": 0.767, |
| "num_tokens": 809213.0, |
| "reward": 37.08262882232666, |
| "reward_std": 9.928300952911377, |
| "rewards/conciseness_reward/mean": 7.599223709106445, |
| "rewards/conciseness_reward/std": 3.068327784538269, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 305 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 20.6, |
| "completions/mean_length": 10.4125, |
| "completions/mean_terminated_length": 3.0596010208129885, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.10224274406332454, |
| "grad_norm": 2.1636366844177246, |
| "kl": 8.353125, |
| "learning_rate": 0.00017454759996828623, |
| "loss": 0.7837, |
| "num_tokens": 821771.0, |
| "reward": 38.445166778564456, |
| "reward_std": 10.677533721923828, |
| "rewards/conciseness_reward/mean": 7.878444194793701, |
| "rewards/conciseness_reward/std": 2.9698015213012696, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 310 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.19375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 18.6, |
| "completions/mean_length": 7.9625, |
| "completions/mean_terminated_length": 2.1980216979980467, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.10389182058047493, |
| "grad_norm": 1.422290563583374, |
| "kl": 9.959375, |
| "learning_rate": 0.00017337298645028764, |
| "loss": 0.7172, |
| "num_tokens": 832413.0, |
| "reward": 41.570855712890626, |
| "reward_std": 6.847911691665649, |
| "rewards/conciseness_reward/mean": 8.51898136138916, |
| "rewards/conciseness_reward/std": 2.5959963321685793, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 315 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.24375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 11.6, |
| "completions/mean_length": 9.3375, |
| "completions/mean_terminated_length": 1.976455068588257, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.10554089709762533, |
| "grad_norm": 5.7738847732543945, |
| "kl": 11.91875, |
| "learning_rate": 0.00017217602280983623, |
| "loss": 0.9398, |
| "num_tokens": 844013.0, |
| "reward": 40.131536865234374, |
| "reward_std": 10.431174755096436, |
| "rewards/conciseness_reward/mean": 8.224026775360107, |
| "rewards/conciseness_reward/std": 2.8040316104888916, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 320 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 6.0, |
| "completions/mean_length": 8.51875, |
| "completions/mean_terminated_length": 1.1981538534164429, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.10718997361477572, |
| "grad_norm": 1.6407678127288818, |
| "kl": 9.725, |
| "learning_rate": 0.0001709570736536521, |
| "loss": 0.7634, |
| "num_tokens": 855598.0, |
| "reward": 40.76496963500976, |
| "reward_std": 8.526632690429688, |
| "rewards/conciseness_reward/mean": 8.353833961486817, |
| "rewards/conciseness_reward/std": 2.905502271652222, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 325 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 12.4, |
| "completions/mean_length": 7.99375, |
| "completions/mean_terminated_length": 1.5209110260009766, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.10883905013192612, |
| "grad_norm": 2.852661609649658, |
| "kl": 11.234375, |
| "learning_rate": 0.00016971651028545648, |
| "loss": 0.8528, |
| "num_tokens": 869583.0, |
| "reward": 40.956661987304685, |
| "reward_std": 9.296725082397462, |
| "rewards/conciseness_reward/mean": 8.393116474151611, |
| "rewards/conciseness_reward/std": 2.911441469192505, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 330 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 17.8, |
| "completions/mean_length": 7.90625, |
| "completions/mean_terminated_length": 1.8667908191680909, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.11048812664907652, |
| "grad_norm": 1.3491684198379517, |
| "kl": 9.65625, |
| "learning_rate": 0.00016845471059286887, |
| "loss": 0.7327, |
| "num_tokens": 882242.0, |
| "reward": 41.10959243774414, |
| "reward_std": 8.142712497711182, |
| "rewards/conciseness_reward/mean": 8.424456214904785, |
| "rewards/conciseness_reward/std": 2.8741564750671387, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 335 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.31875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 16.6, |
| "completions/mean_length": 11.775, |
| "completions/mean_terminated_length": 2.221480059623718, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.11213720316622691, |
| "grad_norm": 2.546013593673706, |
| "kl": 8.3375, |
| "learning_rate": 0.00016717205893229903, |
| "loss": 0.6472, |
| "num_tokens": 894454.0, |
| "reward": 36.69198989868164, |
| "reward_std": 8.701870346069336, |
| "rewards/conciseness_reward/mean": 7.519171237945557, |
| "rewards/conciseness_reward/std": 3.3117987632751467, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 340 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.23125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 13.6, |
| "completions/mean_length": 8.6375, |
| "completions/mean_terminated_length": 1.6123589992523193, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.1137862796833773, |
| "grad_norm": 1.6078628301620483, |
| "kl": 5.08125, |
| "learning_rate": 0.00016586894601186805, |
| "loss": 0.4841, |
| "num_tokens": 907630.0, |
| "reward": 40.832821655273435, |
| "reward_std": 7.08829927444458, |
| "rewards/conciseness_reward/mean": 8.367738628387452, |
| "rewards/conciseness_reward/std": 2.808896017074585, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 345 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 23.2, |
| "completions/mean_length": 17.94375, |
| "completions/mean_terminated_length": 3.825910973548889, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.11543535620052771, |
| "grad_norm": 8.127303123474121, |
| "kl": 16.021875, |
| "learning_rate": 0.00016454576877239507, |
| "loss": 1.0026, |
| "num_tokens": 920553.0, |
| "reward": 31.658840942382813, |
| "reward_std": 11.956652450561524, |
| "rewards/conciseness_reward/mean": 6.487744331359863, |
| "rewards/conciseness_reward/std": 3.047306680679321, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 350 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.49375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 20.6, |
| "completions/mean_length": 17.4375, |
| "completions/mean_terminated_length": 3.490882396697998, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.1170844327176781, |
| "grad_norm": 2.003816843032837, |
| "kl": 5.35625, |
| "learning_rate": 0.0001632029302664851, |
| "loss": 0.5399, |
| "num_tokens": 936001.0, |
| "reward": 30.59487419128418, |
| "reward_std": 10.253981018066407, |
| "rewards/conciseness_reward/mean": 6.269708824157715, |
| "rewards/conciseness_reward/std": 3.309423828125, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 355 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 21.8, |
| "completions/mean_length": 10.33125, |
| "completions/mean_terminated_length": 2.5056591749191286, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.11873350923482849, |
| "grad_norm": 1.3860801458358765, |
| "kl": 6.5125, |
| "learning_rate": 0.0001618408395357554, |
| "loss": 0.6358, |
| "num_tokens": 947848.0, |
| "reward": 37.97140731811523, |
| "reward_std": 9.808005714416504, |
| "rewards/conciseness_reward/mean": 7.781358432769776, |
| "rewards/conciseness_reward/std": 3.182269048690796, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 360 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 19.0, |
| "completions/mean_length": 7.55625, |
| "completions/mean_terminated_length": 1.9193000078201294, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.1203825857519789, |
| "grad_norm": 3.3567373752593994, |
| "kl": 13.759375, |
| "learning_rate": 0.0001604599114862375, |
| "loss": 0.8571, |
| "num_tokens": 959841.0, |
| "reward": 41.46042251586914, |
| "reward_std": 6.935988235473633, |
| "rewards/conciseness_reward/mean": 8.496350860595703, |
| "rewards/conciseness_reward/std": 2.798071002960205, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 365 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25625, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 14.2, |
| "completions/mean_length": 9.6625, |
| "completions/mean_terminated_length": 2.0050908803939818, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.12203166226912929, |
| "grad_norm": 1.5818016529083252, |
| "kl": 8.46875, |
| "learning_rate": 0.00015906056676199255, |
| "loss": 0.7285, |
| "num_tokens": 971895.0, |
| "reward": 38.987307739257815, |
| "reward_std": 9.816894721984863, |
| "rewards/conciseness_reward/mean": 7.989543151855469, |
| "rewards/conciseness_reward/std": 3.1488665103912354, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 370 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 12.8, |
| "completions/mean_length": 8.95, |
| "completions/mean_terminated_length": 1.782608699798584, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.12368073878627968, |
| "grad_norm": 2.568260669708252, |
| "kl": 8.846875, |
| "learning_rate": 0.00015764323161697935, |
| "loss": 0.7342, |
| "num_tokens": 983269.0, |
| "reward": 40.019395446777345, |
| "reward_std": 8.695895671844482, |
| "rewards/conciseness_reward/mean": 8.201046085357666, |
| "rewards/conciseness_reward/std": 2.975964069366455, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 375 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.2375, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 10.0, |
| "completions/mean_length": 8.64375, |
| "completions/mean_terminated_length": 1.3947399377822876, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.12532981530343007, |
| "grad_norm": 1.7566519975662231, |
| "kl": 10.090625, |
| "learning_rate": 0.00015620833778521307, |
| "loss": 0.7109, |
| "num_tokens": 994490.0, |
| "reward": 40.795552825927736, |
| "reward_std": 6.594453907012939, |
| "rewards/conciseness_reward/mean": 8.360101222991943, |
| "rewards/conciseness_reward/std": 2.781657338142395, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 380 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.36875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 1.0, |
| "completions/mean_length": 12.43125, |
| "completions/mean_terminated_length": 1.0, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.12697889182058048, |
| "grad_norm": 1.1844645738601685, |
| "kl": 6.378125, |
| "learning_rate": 0.00015475632234925504, |
| "loss": 0.6129, |
| "num_tokens": 1006117.0, |
| "reward": 36.47860527038574, |
| "reward_std": 11.079174518585205, |
| "rewards/conciseness_reward/mean": 7.475443267822266, |
| "rewards/conciseness_reward/std": 3.2058629512786867, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 385 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5125, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 9.4, |
| "completions/mean_length": 17.15, |
| "completions/mean_terminated_length": 1.519215726852417, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.12862796833773088, |
| "grad_norm": 1.8607726097106934, |
| "kl": 10.7625, |
| "learning_rate": 0.000153287627607073, |
| "loss": 0.8252, |
| "num_tokens": 1019911.0, |
| "reward": 31.58481674194336, |
| "reward_std": 12.696942138671876, |
| "rewards/conciseness_reward/mean": 6.472575092315674, |
| "rewards/conciseness_reward/std": 3.4355133533477784, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.0, |
| "rewards/reward_func_keywords/std": 0.0, |
| "step": 390 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.26875, |
| "completions/max_length": 32.0, |
| "completions/max_terminated_length": 9.2, |
| "completions/mean_length": 9.78125, |
| "completions/mean_terminated_length": 1.5907407760620118, |
| "completions/min_length": 1.0, |
| "completions/min_terminated_length": 1.0, |
| "epoch": 0.13027704485488126, |
| "grad_norm": 1.307873010635376, |
| "kl": 7.2125, |
| "learning_rate": 0.00015180270093731303, |
| "loss": 0.6376, |
| "num_tokens": 1034198.0, |
| "reward": 39.4156982421875, |
| "reward_std": 8.192217206954956, |
| "rewards/conciseness_reward/mean": 8.075745105743408, |
| "rewards/conciseness_reward/std": 3.032507038116455, |
| "rewards/reward_func_correct_answer/mean": 0.0, |
| "rewards/reward_func_correct_answer/std": 0.0, |
| "rewards/reward_func_keywords/mean": 0.002083333395421505, |
| "rewards/reward_func_keywords/std": 0.01178511381149292, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.13192612137203166, |
| "grad_norm": 1.1687965393066406, |
| "learning_rate": 0.00015030199466302353, |
| "loss": 0.5685, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.13192612137203166, |
| "eval_clip_ratio/high_max": 0.0, |
| "eval_clip_ratio/high_mean": 0.0, |
| "eval_clip_ratio/low_mean": 0.0, |
| "eval_clip_ratio/low_min": 0.0, |
| "eval_clip_ratio/region_mean": 0.0, |
| "eval_completions/clipped_ratio": 0.2789835164835165, |
| "eval_completions/max_length": 32.0, |
| "eval_completions/max_terminated_length": 8.082417582417582, |
| "eval_completions/mean_length": 10.17390110204508, |
| "eval_completions/mean_terminated_length": 1.7402353758340354, |
| "eval_completions/min_length": 1.0, |
| "eval_completions/min_terminated_length": 1.0, |
| "eval_kl": 12.785199175824175, |
| "eval_loss": 0.8869044184684753, |
| "eval_num_tokens": 1045628.0, |
| "eval_reward": 39.34467090355171, |
| "eval_reward_std": 9.140103686150614, |
| "eval_rewards/conciseness_reward/mean": 8.06277670441093, |
| "eval_rewards/conciseness_reward/std": 2.9405157920587195, |
| "eval_rewards/reward_func_correct_answer/mean": 0.0, |
| "eval_rewards/reward_func_correct_answer/std": 0.0, |
| "eval_rewards/reward_func_keywords/mean": 0.0, |
| "eval_rewards/reward_func_keywords/std": 0.0, |
| "eval_runtime": 259.9033, |
| "eval_samples_per_second": 5.598, |
| "eval_steps_per_second": 0.35, |
| "step": 400 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 1000, |
| "num_input_tokens_seen": 1045628, |
| "num_train_epochs": 1, |
| "save_steps": 100, |
| "stateful_callbacks": { |
| "EarlyStoppingCallback": { |
| "args": { |
| "early_stopping_patience": 5, |
| "early_stopping_threshold": 0.0001 |
| }, |
| "attributes": { |
| "early_stopping_patience_counter": 0 |
| } |
| }, |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|