FormlessAI's picture
Training in progress, epoch 0, checkpoint
b2bceef verified
{
"best_global_step": null,
"best_metric": 0.8869044184684753,
"best_model_checkpoint": null,
"epoch": 0.13192612137203166,
"eval_steps": 100,
"global_step": 400,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 15.6,
"completions/mean_length": 31.29375,
"completions/mean_terminated_length": 13.3,
"completions/min_length": 11.0,
"completions/min_terminated_length": 11.0,
"epoch": 0.0016490765171503958,
"grad_norm": 1.0301371812820435,
"kl": 0.000438690185546875,
"learning_rate": 8.000000000000001e-06,
"loss": 0.02,
"num_tokens": 14709.0,
"reward": 15.14717788696289,
"reward_std": 2.122625803947449,
"rewards/conciseness_reward/mean": 3.1040622711181642,
"rewards/conciseness_reward/std": 1.0407999098300933,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 8.2,
"completions/mean_length": 31.4125,
"completions/mean_terminated_length": 5.0,
"completions/min_length": 19.6,
"completions/min_terminated_length": 0.4,
"epoch": 0.0032981530343007917,
"grad_norm": 1.00534188747406,
"kl": 0.0009120941162109375,
"learning_rate": 1.8e-05,
"loss": 0.0162,
"num_tokens": 31719.0,
"reward": 15.080268859863281,
"reward_std": 1.7717459440231322,
"rewards/conciseness_reward/mean": 3.090350866317749,
"rewards/conciseness_reward/std": 0.7454259812831878,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 11.8,
"completions/mean_length": 31.375,
"completions/mean_terminated_length": 11.8,
"completions/min_length": 18.2,
"completions/min_terminated_length": 11.8,
"epoch": 0.004947229551451188,
"grad_norm": 1.1287697553634644,
"kl": 0.002832794189453125,
"learning_rate": 2.8000000000000003e-05,
"loss": 0.0068,
"num_tokens": 49875.0,
"reward": 15.289332962036132,
"reward_std": 1.7224721908569336,
"rewards/conciseness_reward/mean": 3.133193778991699,
"rewards/conciseness_reward/std": 1.0818009793758392,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 11.4,
"completions/mean_length": 30.39375,
"completions/mean_terminated_length": 7.133333468437195,
"completions/min_length": 11.4,
"completions/min_terminated_length": 5.0,
"epoch": 0.006596306068601583,
"grad_norm": 1.091719627380371,
"kl": 0.01212921142578125,
"learning_rate": 3.8e-05,
"loss": 0.0123,
"num_tokens": 65538.0,
"reward": 16.047473907470703,
"reward_std": 1.4964761018753052,
"rewards/conciseness_reward/mean": 3.288557195663452,
"rewards/conciseness_reward/std": 1.2817005276679994,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.95,
"completions/max_length": 32.0,
"completions/max_terminated_length": 21.6,
"completions/mean_length": 31.35625,
"completions/mean_terminated_length": 20.3,
"completions/min_length": 19.0,
"completions/min_terminated_length": 19.0,
"epoch": 0.008245382585751979,
"grad_norm": 1.1559650897979736,
"kl": 0.0252685546875,
"learning_rate": 4.8e-05,
"loss": 0.0148,
"num_tokens": 81341.0,
"reward": 15.850406646728516,
"reward_std": 1.5481059789657592,
"rewards/conciseness_reward/mean": 3.2481727600097656,
"rewards/conciseness_reward/std": 0.7157365679740906,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.95625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 11.2,
"completions/mean_length": 31.24375,
"completions/mean_terminated_length": 9.4,
"completions/min_length": 19.6,
"completions/min_terminated_length": 6.8,
"epoch": 0.009894459102902375,
"grad_norm": 1.179612636566162,
"kl": 0.041455078125,
"learning_rate": 5.8e-05,
"loss": 0.0168,
"num_tokens": 96376.0,
"reward": 16.42060241699219,
"reward_std": 2.5188846826553344,
"rewards/conciseness_reward/mean": 3.3650211811065676,
"rewards/conciseness_reward/std": 0.9817604303359986,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.94375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 9.6,
"completions/mean_length": 30.7375,
"completions/mean_terminated_length": 5.9333335876464846,
"completions/min_length": 7.2,
"completions/min_terminated_length": 0.8,
"epoch": 0.01154353562005277,
"grad_norm": 1.2418527603149414,
"kl": 0.0891357421875,
"learning_rate": 6.800000000000001e-05,
"loss": 0.0289,
"num_tokens": 110096.0,
"reward": 17.450071144104005,
"reward_std": 2.6924588203430178,
"rewards/conciseness_reward/mean": 3.5759867668151855,
"rewards/conciseness_reward/std": 1.3786604046821593,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.79375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 21.4,
"completions/mean_length": 26.95,
"completions/mean_terminated_length": 8.594264364242553,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.013192612137203167,
"grad_norm": 1.533212661743164,
"kl": 0.17685546875,
"learning_rate": 7.800000000000001e-05,
"loss": 0.0944,
"num_tokens": 125832.0,
"reward": 22.291907501220702,
"reward_std": 5.0477148532867435,
"rewards/conciseness_reward/mean": 4.568208789825439,
"rewards/conciseness_reward/std": 2.234351325035095,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.525,
"completions/max_length": 32.0,
"completions/max_terminated_length": 25.2,
"completions/mean_length": 18.95625,
"completions/mean_terminated_length": 4.880657196044922,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.014841688654353561,
"grad_norm": 2.227647304534912,
"kl": 0.81318359375,
"learning_rate": 8.800000000000001e-05,
"loss": 0.2385,
"num_tokens": 138589.0,
"reward": 31.8361515045166,
"reward_std": 6.978989696502685,
"rewards/conciseness_reward/mean": 6.524080085754394,
"rewards/conciseness_reward/std": 2.7887794971466064,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1375,
"completions/max_length": 26.0,
"completions/max_terminated_length": 16.6,
"completions/mean_length": 6.43125,
"completions/mean_terminated_length": 2.5217425346374513,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.016490765171503958,
"grad_norm": 0.19684094190597534,
"kl": 3.301953125,
"learning_rate": 9.8e-05,
"loss": 0.3463,
"num_tokens": 151914.0,
"reward": 43.79222412109375,
"reward_std": 4.306332683563232,
"rewards/conciseness_reward/mean": 8.974199676513672,
"rewards/conciseness_reward/std": 1.7521500557661056,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 9.0,
"completions/max_terminated_length": 9.0,
"completions/mean_length": 1.31875,
"completions/mean_terminated_length": 1.31875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.018139841688654353,
"grad_norm": 0.013599345460534096,
"kl": 6.190625,
"learning_rate": 0.00010800000000000001,
"loss": 0.3072,
"num_tokens": 163061.0,
"reward": 48.47799987792969,
"reward_std": 0.45243007838726046,
"rewards/conciseness_reward/mean": 9.934440612792969,
"rewards/conciseness_reward/std": 0.28381501138210297,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.01978891820580475,
"grad_norm": 0.0023122939746826887,
"kl": 6.559375,
"learning_rate": 0.000118,
"loss": 0.2624,
"num_tokens": 173693.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.2,
"completions/max_terminated_length": 1.2,
"completions/mean_length": 1.00625,
"completions/mean_terminated_length": 1.00625,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.021437994722955146,
"grad_norm": 0.0008344887173734605,
"kl": 28.259375,
"learning_rate": 0.00012800000000000002,
"loss": 1.1298,
"num_tokens": 183802.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.02308707124010554,
"grad_norm": 0.011819743551313877,
"kl": 6.371875,
"learning_rate": 0.000138,
"loss": 0.2552,
"num_tokens": 195132.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.4,
"completions/max_terminated_length": 1.4,
"completions/mean_length": 1.01875,
"completions/mean_terminated_length": 1.01875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.024736147757255935,
"grad_norm": 0.02948692999780178,
"kl": 6.521875,
"learning_rate": 0.000148,
"loss": 0.2609,
"num_tokens": 204273.0,
"reward": 48.74246368408203,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 9.98863639831543,
"rewards/conciseness_reward/std": 0.044715401530265805,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.01875,
"completions/max_length": 21.8,
"completions/max_terminated_length": 6.4,
"completions/mean_length": 1.76875,
"completions/mean_terminated_length": 1.191330623626709,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.026385224274406333,
"grad_norm": 0.6150962114334106,
"kl": 5.475,
"learning_rate": 0.00015800000000000002,
"loss": 0.3071,
"num_tokens": 217108.0,
"reward": 48.03885269165039,
"reward_std": 1.0734764248132707,
"rewards/conciseness_reward/mean": 9.844447708129882,
"rewards/conciseness_reward/std": 0.7437670588493347,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.028034300791556728,
"grad_norm": 0.004543509799987078,
"kl": 6.296875,
"learning_rate": 0.000168,
"loss": 0.2519,
"num_tokens": 228886.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.4,
"completions/max_terminated_length": 1.4,
"completions/mean_length": 1.025,
"completions/mean_terminated_length": 1.025,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.029683377308707123,
"grad_norm": 0.004970578011125326,
"kl": 6.803125,
"learning_rate": 0.00017800000000000002,
"loss": 0.2777,
"num_tokens": 238962.0,
"reward": 48.71935882568359,
"reward_std": 0.11109672784805298,
"rewards/conciseness_reward/mean": 9.983901596069336,
"rewards/conciseness_reward/std": 0.06620492339134217,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.6,
"completions/max_terminated_length": 1.6,
"completions/mean_length": 1.01875,
"completions/mean_terminated_length": 1.01875,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.03133245382585752,
"grad_norm": 1.1012120246887207,
"kl": 6.246875,
"learning_rate": 0.000188,
"loss": 0.2593,
"num_tokens": 250757.0,
"reward": 48.74246444702148,
"reward_std": 0.07842119336128235,
"rewards/conciseness_reward/mean": 9.98863639831543,
"rewards/conciseness_reward/std": 0.06428244113922119,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 95
},
{
"epoch": 0.032981530343007916,
"grad_norm": 0.030409209430217743,
"learning_rate": 0.00019800000000000002,
"loss": 0.2568,
"step": 100
},
{
"epoch": 0.032981530343007916,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.0,
"eval_completions/max_length": 1.0934065934065933,
"eval_completions/max_terminated_length": 1.0934065934065933,
"eval_completions/mean_length": 1.0061813186813187,
"eval_completions/mean_terminated_length": 1.0061813186813187,
"eval_completions/min_length": 1.0,
"eval_completions/min_terminated_length": 1.0,
"eval_kl": 6.581902472527473,
"eval_loss": 0.26390206813812256,
"eval_num_tokens": 260814.0,
"eval_reward": 48.79334613255092,
"eval_reward_std": 0.006463285167138655,
"eval_rewards/conciseness_reward/mean": 9.999063439421601,
"eval_rewards/conciseness_reward/std": 0.0037462543491478804,
"eval_rewards/reward_func_correct_answer/mean": 0.0,
"eval_rewards/reward_func_correct_answer/std": 0.0,
"eval_rewards/reward_func_keywords/mean": 0.0,
"eval_rewards/reward_func_keywords/std": 0.0,
"eval_runtime": 27.0556,
"eval_samples_per_second": 53.778,
"eval_steps_per_second": 3.363,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.2,
"completions/max_terminated_length": 1.2,
"completions/mean_length": 1.00625,
"completions/mean_terminated_length": 1.00625,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.034630606860158314,
"grad_norm": 0.0008123432635329664,
"kl": 6.45625,
"learning_rate": 0.00019999025240093044,
"loss": 0.2631,
"num_tokens": 271803.0,
"reward": 48.78405342102051,
"reward_std": 0.019605298340320588,
"rewards/conciseness_reward/mean": 9.997159099578857,
"rewards/conciseness_reward/std": 0.016070610284805296,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.036279683377308705,
"grad_norm": 0.00020162259170319885,
"kl": 6.33125,
"learning_rate": 0.00019995065603657316,
"loss": 0.2533,
"num_tokens": 284121.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0379287598944591,
"grad_norm": 0.00028325500898063183,
"kl": 6.35625,
"learning_rate": 0.0001998806137341434,
"loss": 0.2545,
"num_tokens": 296089.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0395778364116095,
"grad_norm": 0.00010048302647192031,
"kl": 6.546875,
"learning_rate": 0.000199780146829205,
"loss": 0.2619,
"num_tokens": 308593.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.04122691292875989,
"grad_norm": 5.4579424613621086e-05,
"kl": 6.2625,
"learning_rate": 0.00019964928592495045,
"loss": 0.2505,
"num_tokens": 319585.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.04287598944591029,
"grad_norm": 5.583846723311581e-05,
"kl": 6.171875,
"learning_rate": 0.00019948807088287883,
"loss": 0.2469,
"num_tokens": 330515.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.04452506596306069,
"grad_norm": 3.738172017619945e-05,
"kl": 6.175,
"learning_rate": 0.0001992965508106537,
"loss": 0.247,
"num_tokens": 341113.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.04617414248021108,
"grad_norm": 3.934466440114193e-05,
"kl": 6.465625,
"learning_rate": 0.00019907478404714436,
"loss": 0.2587,
"num_tokens": 351807.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.04782321899736148,
"grad_norm": 0.00020795360615011305,
"kl": 6.440625,
"learning_rate": 0.0001988228381446553,
"loss": 0.2575,
"num_tokens": 362049.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.04947229551451187,
"grad_norm": 3.9276594179682434e-05,
"kl": 6.415625,
"learning_rate": 0.00019854078984834903,
"loss": 0.2569,
"num_tokens": 372893.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.05112137203166227,
"grad_norm": 7.476914470316842e-05,
"kl": 6.3125,
"learning_rate": 0.0001982287250728689,
"loss": 0.2526,
"num_tokens": 383645.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 1.0,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.052770448548812667,
"grad_norm": 0.0009598923497833312,
"kl": 6.41875,
"learning_rate": 0.0001978867388761685,
"loss": 0.257,
"num_tokens": 394303.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 1.2,
"completions/max_terminated_length": 1.2,
"completions/mean_length": 1.00625,
"completions/mean_terminated_length": 1.00625,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.05441952506596306,
"grad_norm": 0.0006444460013881326,
"kl": 6.83125,
"learning_rate": 0.00019751493543055632,
"loss": 0.2732,
"num_tokens": 405606.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 13.8,
"completions/max_terminated_length": 11.2,
"completions/mean_length": 4.44375,
"completions/mean_terminated_length": 2.8541286468505858,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.056068601583113456,
"grad_norm": 0.2734026610851288,
"kl": 6.778125,
"learning_rate": 0.00019711342799096361,
"loss": 0.2783,
"num_tokens": 417897.0,
"reward": 48.74246444702148,
"reward_std": 0.07842119336128235,
"rewards/conciseness_reward/mean": 9.98863639831543,
"rewards/conciseness_reward/std": 0.06428244113922119,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 9.2,
"completions/mean_length": 27.825,
"completions/mean_terminated_length": 4.833333539962768,
"completions/min_length": 8.2,
"completions/min_terminated_length": 1.8,
"epoch": 0.057717678100263854,
"grad_norm": 0.005825403146445751,
"kl": 3.1109375,
"learning_rate": 0.00019668233886044597,
"loss": 0.1245,
"num_tokens": 432469.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.85625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 18.6,
"completions/mean_length": 28.75625,
"completions/mean_terminated_length": 9.920000028610229,
"completions/min_length": 2.2,
"completions/min_terminated_length": 2.2,
"epoch": 0.059366754617414245,
"grad_norm": 0.014362619258463383,
"kl": 2.59609375,
"learning_rate": 0.00019622179935292855,
"loss": 0.1039,
"num_tokens": 447002.0,
"reward": 48.7701904296875,
"reward_std": 0.039210596680641176,
"rewards/conciseness_reward/mean": 9.994318199157714,
"rewards/conciseness_reward/std": 0.03214122056961059,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 0.4,
"completions/mean_length": 31.625,
"completions/mean_terminated_length": 0.4,
"completions/min_length": 26.0,
"completions/min_terminated_length": 0.4,
"epoch": 0.061015831134564644,
"grad_norm": 0.011907841078937054,
"kl": 1.64453125,
"learning_rate": 0.00019573194975320673,
"loss": 0.0658,
"num_tokens": 461740.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.99375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 4.6,
"completions/mean_length": 31.94375,
"completions/mean_terminated_length": 4.6,
"completions/min_length": 30.2,
"completions/min_terminated_length": 4.6,
"epoch": 0.06266490765171503,
"grad_norm": 0.002289639785885811,
"kl": 1.58515625,
"learning_rate": 0.00019521293927421388,
"loss": 0.0634,
"num_tokens": 476849.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 31.85,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 27.2,
"completions/min_terminated_length": 8.0,
"epoch": 0.06431398416886544,
"grad_norm": 0.01182704046368599,
"kl": 1.59765625,
"learning_rate": 0.00019466492601156966,
"loss": 0.0638,
"num_tokens": 492277.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 195
},
{
"epoch": 0.06596306068601583,
"grad_norm": 0.008451790548861027,
"learning_rate": 0.00019408807689542257,
"loss": 0.0666,
"step": 200
},
{
"epoch": 0.06596306068601583,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.9807692307692307,
"eval_completions/max_length": 32.0,
"eval_completions/max_terminated_length": 4.230769230769231,
"eval_completions/mean_length": 31.67135989010989,
"eval_completions/mean_terminated_length": 3.882783884530539,
"eval_completions/min_length": 27.142857142857142,
"eval_completions/min_terminated_length": 3.5824175824175826,
"eval_kl": 1.6657366071428572,
"eval_loss": 0.0666266530752182,
"eval_num_tokens": 510020.0,
"eval_reward": 48.78877585274832,
"eval_reward_std": 0.01292657033427731,
"eval_rewards/conciseness_reward/mean": 9.998126878843202,
"eval_rewards/conciseness_reward/std": 0.007492508698295761,
"eval_rewards/reward_func_correct_answer/mean": 0.0,
"eval_rewards/reward_func_correct_answer/std": 0.0,
"eval_rewards/reward_func_keywords/mean": 0.0,
"eval_rewards/reward_func_keywords/std": 0.0,
"eval_runtime": 264.6454,
"eval_samples_per_second": 5.498,
"eval_steps_per_second": 0.344,
"step": 200
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.971875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 12.2,
"completions/mean_length": 31.621875,
"completions/mean_terminated_length": 10.7,
"completions/min_length": 22.0,
"completions/min_terminated_length": 9.2,
"epoch": 0.06761213720316622,
"grad_norm": 0.016916701570153236,
"kl": 1.674609375,
"learning_rate": 0.00019348256763960145,
"loss": 0.0673,
"num_tokens": 524948.0,
"reward": 48.78405342102051,
"reward_std": 0.019605298340320588,
"rewards/conciseness_reward/mean": 9.997159099578857,
"rewards/conciseness_reward/std": 0.016070610284805296,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 205
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.98125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 8.0,
"completions/mean_length": 31.65,
"completions/mean_terminated_length": 8.0,
"completions/min_length": 20.8,
"completions/min_terminated_length": 8.0,
"epoch": 0.06926121372031663,
"grad_norm": 0.008241601288318634,
"kl": 1.65078125,
"learning_rate": 0.00019284858268809137,
"loss": 0.066,
"num_tokens": 540530.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 210
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 8.8,
"completions/mean_length": 31.29375,
"completions/mean_terminated_length": 7.8,
"completions/min_length": 12.8,
"completions/min_terminated_length": 6.4,
"epoch": 0.07091029023746702,
"grad_norm": 0.00787361804395914,
"kl": 1.70546875,
"learning_rate": 0.00019218631515885006,
"loss": 0.0682,
"num_tokens": 556357.0,
"reward": 48.797916412353516,
"reward_std": 0.0,
"rewards/conciseness_reward/mean": 10.0,
"rewards/conciseness_reward/std": 0.0,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 215
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.98125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 10.2,
"completions/mean_length": 31.71875,
"completions/mean_terminated_length": 10.2,
"completions/min_length": 23.0,
"completions/min_terminated_length": 10.2,
"epoch": 0.07255936675461741,
"grad_norm": 0.02177685871720314,
"kl": 1.64609375,
"learning_rate": 0.0001914959667849825,
"loss": 0.0659,
"num_tokens": 572716.0,
"reward": 48.7701904296875,
"reward_std": 0.039210596680641176,
"rewards/conciseness_reward/mean": 9.994318199157714,
"rewards/conciseness_reward/std": 0.03214122056961059,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 220
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.95625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 11.2,
"completions/mean_length": 31.44375,
"completions/mean_terminated_length": 8.083333587646484,
"completions/min_length": 24.6,
"completions/min_terminated_length": 5.4,
"epoch": 0.07420844327176782,
"grad_norm": 0.5946508646011353,
"kl": 1.86953125,
"learning_rate": 0.00019077774785329087,
"loss": 0.0748,
"num_tokens": 588551.0,
"reward": 48.385579681396486,
"reward_std": 0.3609386831521988,
"rewards/conciseness_reward/mean": 9.91550121307373,
"rewards/conciseness_reward/std": 0.1702045440673828,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 225
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.86875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 20.8,
"completions/mean_length": 30.075,
"completions/mean_terminated_length": 14.540000343322754,
"completions/min_length": 15.6,
"completions/min_terminated_length": 9.2,
"epoch": 0.0758575197889182,
"grad_norm": 0.0492733009159565,
"kl": 2.21875,
"learning_rate": 0.00019003187714021938,
"loss": 0.0943,
"num_tokens": 604083.0,
"reward": 48.42716827392578,
"reward_std": 0.44589495956897734,
"rewards/conciseness_reward/mean": 9.92402400970459,
"rewards/conciseness_reward/std": 0.20466775298118592,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 230
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.68125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 30.4,
"completions/mean_length": 27.01875,
"completions/mean_terminated_length": 16.836363792419434,
"completions/min_length": 3.8,
"completions/min_terminated_length": 3.8,
"epoch": 0.0775065963060686,
"grad_norm": 0.8122158050537109,
"kl": 3.3203125,
"learning_rate": 0.00018925858184521256,
"loss": 0.144,
"num_tokens": 618996.0,
"reward": 45.936913299560544,
"reward_std": 3.0336299002170564,
"rewards/conciseness_reward/mean": 9.413703918457031,
"rewards/conciseness_reward/std": 0.8155404955148697,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 235
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.51875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 21.4,
"completions/mean_length": 20.75,
"completions/mean_terminated_length": 8.305263471603393,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.079155672823219,
"grad_norm": 0.22038057446479797,
"kl": 3.075,
"learning_rate": 0.0001884580975215084,
"loss": 0.1218,
"num_tokens": 633380.0,
"reward": 48.46982192993164,
"reward_std": 0.46399208903312683,
"rewards/conciseness_reward/mean": 9.932765197753906,
"rewards/conciseness_reward/std": 0.2160962074995041,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 240
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.66875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 1.4,
"completions/mean_length": 21.74375,
"completions/mean_terminated_length": 1.0307692289352417,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.0808047493403694,
"grad_norm": 0.5326492786407471,
"kl": 2.6203125,
"learning_rate": 0.00018763066800438636,
"loss": 0.1141,
"num_tokens": 647049.0,
"reward": 48.326927947998044,
"reward_std": 0.6660776942968368,
"rewards/conciseness_reward/mean": 9.903482246398926,
"rewards/conciseness_reward/std": 0.3215636372566223,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 245
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.95,
"completions/max_length": 32.0,
"completions/max_terminated_length": 2.2,
"completions/mean_length": 30.49375,
"completions/mean_terminated_length": 1.2666666984558106,
"completions/min_length": 7.2,
"completions/min_terminated_length": 0.8,
"epoch": 0.08245382585751979,
"grad_norm": 0.5493065714836121,
"kl": 1.91875,
"learning_rate": 0.00018677654533689287,
"loss": 0.088,
"num_tokens": 660962.0,
"reward": 46.19136734008789,
"reward_std": 3.174171257019043,
"rewards/conciseness_reward/mean": 9.465848350524903,
"rewards/conciseness_reward/std": 1.1706744194030763,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 250
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.925,
"completions/max_length": 32.0,
"completions/max_terminated_length": 7.4,
"completions/mean_length": 29.89375,
"completions/mean_terminated_length": 2.940000057220459,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.08410290237467019,
"grad_norm": 0.4117783308029175,
"kl": 1.5734375,
"learning_rate": 0.00018589598969306645,
"loss": 0.0831,
"num_tokens": 676141.0,
"reward": 45.986488342285156,
"reward_std": 3.3635273456573485,
"rewards/conciseness_reward/mean": 9.423862838745118,
"rewards/conciseness_reward/std": 1.503001594543457,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 255
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.925,
"completions/max_length": 32.0,
"completions/max_terminated_length": 3.4,
"completions/mean_length": 29.75625,
"completions/mean_terminated_length": 1.5200000286102295,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.08575197889182058,
"grad_norm": 0.5317473411560059,
"kl": 1.50234375,
"learning_rate": 0.00018498926929868642,
"loss": 0.0686,
"num_tokens": 691010.0,
"reward": 46.40044937133789,
"reward_std": 3.155266261100769,
"rewards/conciseness_reward/mean": 9.508695220947265,
"rewards/conciseness_reward/std": 1.3738978862762452,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 260
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 0.6,
"completions/mean_length": 31.03125,
"completions/mean_terminated_length": 0.6,
"completions/min_length": 13.4,
"completions/min_terminated_length": 0.6,
"epoch": 0.08740105540897097,
"grad_norm": 0.5502648949623108,
"kl": 1.578125,
"learning_rate": 0.00018405666034956844,
"loss": 0.0744,
"num_tokens": 704749.0,
"reward": 45.646009826660155,
"reward_std": 3.507078266143799,
"rewards/conciseness_reward/mean": 9.354090118408203,
"rewards/conciseness_reward/std": 1.490102195739746,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 265
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.98125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 0.6,
"completions/mean_length": 31.41875,
"completions/mean_terminated_length": 0.6,
"completions/min_length": 13.4,
"completions/min_terminated_length": 0.6,
"epoch": 0.08905013192612138,
"grad_norm": 1.43314790725708,
"kl": 3.23203125,
"learning_rate": 0.00018309844692743283,
"loss": 0.1462,
"num_tokens": 722046.0,
"reward": 42.508016967773436,
"reward_std": 4.8550762176513675,
"rewards/conciseness_reward/mean": 8.711030864715577,
"rewards/conciseness_reward/std": 1.546305203437805,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 270
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.55,
"completions/max_length": 32.0,
"completions/max_terminated_length": 27.0,
"completions/mean_length": 22.05,
"completions/mean_terminated_length": 10.424438858032227,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.09069920844327177,
"grad_norm": 1.572669506072998,
"kl": 5.246875,
"learning_rate": 0.00018211492091337042,
"loss": 0.2506,
"num_tokens": 735388.0,
"reward": 42.087843322753905,
"reward_std": 5.907807731628418,
"rewards/conciseness_reward/mean": 8.624926567077637,
"rewards/conciseness_reward/std": 1.7810691118240356,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.08125,
"completions/max_length": 31.8,
"completions/max_terminated_length": 27.2,
"completions/mean_length": 7.725,
"completions/mean_terminated_length": 5.570255327224731,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.09234828496042216,
"grad_norm": 3.4831130504608154,
"kl": 10.640625,
"learning_rate": 0.00018110638189893267,
"loss": 0.7782,
"num_tokens": 746406.0,
"reward": 42.14253387451172,
"reward_std": 7.03149824142456,
"rewards/conciseness_reward/mean": 8.636133575439453,
"rewards/conciseness_reward/std": 1.9473312377929688,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 280
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.275,
"completions/max_length": 32.0,
"completions/max_terminated_length": 26.2,
"completions/mean_length": 11.4875,
"completions/mean_terminated_length": 3.7349055290222166,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.09399736147757255,
"grad_norm": 1.5109800100326538,
"kl": 7.4125,
"learning_rate": 0.00018007313709487334,
"loss": 0.6542,
"num_tokens": 758456.0,
"reward": 36.26386070251465,
"reward_std": 9.148859119415283,
"rewards/conciseness_reward/mean": 7.4314359664917,
"rewards/conciseness_reward/std": 3.2661546230316163,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 285
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 24.6,
"completions/mean_length": 8.7125,
"completions/mean_terminated_length": 3.357792377471924,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.09564643799472296,
"grad_norm": 6.532775402069092,
"kl": 11.23125,
"learning_rate": 0.00017901550123756906,
"loss": 0.9046,
"num_tokens": 770046.0,
"reward": 40.13367538452148,
"reward_std": 9.75949649810791,
"rewards/conciseness_reward/mean": 8.224464702606202,
"rewards/conciseness_reward/std": 2.8002068042755126,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 290
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 20.2,
"completions/mean_length": 6.275,
"completions/mean_terminated_length": 2.19633367061615,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.09729551451187335,
"grad_norm": 2.8640220165252686,
"kl": 9.88125,
"learning_rate": 0.00017793379649314744,
"loss": 0.6994,
"num_tokens": 782316.0,
"reward": 42.97630233764649,
"reward_std": 5.981427621841431,
"rewards/conciseness_reward/mean": 8.806995010375976,
"rewards/conciseness_reward/std": 2.3633262157440185,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 295
},
{
"epoch": 0.09894459102902374,
"grad_norm": 2.0157830715179443,
"learning_rate": 0.00017682835235935236,
"loss": 0.7205,
"step": 300
},
{
"epoch": 0.09894459102902374,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.2685897435897436,
"eval_completions/max_length": 31.82967032967033,
"eval_completions/max_terminated_length": 14.214285714285714,
"eval_completions/mean_length": 10.580402932324253,
"eval_completions/mean_terminated_length": 2.7016365750805362,
"eval_completions/min_length": 1.0,
"eval_completions/min_terminated_length": 1.0,
"eval_kl": 6.191105769230769,
"eval_loss": 0.6344618797302246,
"eval_num_tokens": 796625.0,
"eval_reward": 38.31290377103365,
"eval_reward_std": 9.540224014320872,
"eval_rewards/conciseness_reward/mean": 7.851340081665542,
"eval_rewards/conciseness_reward/std": 3.031103436108474,
"eval_rewards/reward_func_correct_answer/mean": 0.0,
"eval_rewards/reward_func_correct_answer/std": 0.0,
"eval_rewards/reward_func_keywords/mean": 0.0,
"eval_rewards/reward_func_keywords/std": 0.0,
"eval_runtime": 258.9208,
"eval_samples_per_second": 5.619,
"eval_steps_per_second": 0.351,
"step": 300
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3,
"completions/max_length": 32.0,
"completions/max_terminated_length": 20.9,
"completions/mean_length": 11.653125,
"completions/mean_terminated_length": 2.9934032917022706,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10059366754617415,
"grad_norm": 2.035731077194214,
"kl": 9.2609375,
"learning_rate": 0.00017569950556517566,
"loss": 0.767,
"num_tokens": 809213.0,
"reward": 37.08262882232666,
"reward_std": 9.928300952911377,
"rewards/conciseness_reward/mean": 7.599223709106445,
"rewards/conciseness_reward/std": 3.068327784538269,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 305
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 20.6,
"completions/mean_length": 10.4125,
"completions/mean_terminated_length": 3.0596010208129885,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10224274406332454,
"grad_norm": 2.1636366844177246,
"kl": 8.353125,
"learning_rate": 0.00017454759996828623,
"loss": 0.7837,
"num_tokens": 821771.0,
"reward": 38.445166778564456,
"reward_std": 10.677533721923828,
"rewards/conciseness_reward/mean": 7.878444194793701,
"rewards/conciseness_reward/std": 2.9698015213012696,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 310
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.19375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 18.6,
"completions/mean_length": 7.9625,
"completions/mean_terminated_length": 2.1980216979980467,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10389182058047493,
"grad_norm": 1.422290563583374,
"kl": 9.959375,
"learning_rate": 0.00017337298645028764,
"loss": 0.7172,
"num_tokens": 832413.0,
"reward": 41.570855712890626,
"reward_std": 6.847911691665649,
"rewards/conciseness_reward/mean": 8.51898136138916,
"rewards/conciseness_reward/std": 2.5959963321685793,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 315
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.24375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 11.6,
"completions/mean_length": 9.3375,
"completions/mean_terminated_length": 1.976455068588257,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10554089709762533,
"grad_norm": 5.7738847732543945,
"kl": 11.91875,
"learning_rate": 0.00017217602280983623,
"loss": 0.9398,
"num_tokens": 844013.0,
"reward": 40.131536865234374,
"reward_std": 10.431174755096436,
"rewards/conciseness_reward/mean": 8.224026775360107,
"rewards/conciseness_reward/std": 2.8040316104888916,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 320
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 6.0,
"completions/mean_length": 8.51875,
"completions/mean_terminated_length": 1.1981538534164429,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10718997361477572,
"grad_norm": 1.6407678127288818,
"kl": 9.725,
"learning_rate": 0.0001709570736536521,
"loss": 0.7634,
"num_tokens": 855598.0,
"reward": 40.76496963500976,
"reward_std": 8.526632690429688,
"rewards/conciseness_reward/mean": 8.353833961486817,
"rewards/conciseness_reward/std": 2.905502271652222,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 325
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 12.4,
"completions/mean_length": 7.99375,
"completions/mean_terminated_length": 1.5209110260009766,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.10883905013192612,
"grad_norm": 2.852661609649658,
"kl": 11.234375,
"learning_rate": 0.00016971651028545648,
"loss": 0.8528,
"num_tokens": 869583.0,
"reward": 40.956661987304685,
"reward_std": 9.296725082397462,
"rewards/conciseness_reward/mean": 8.393116474151611,
"rewards/conciseness_reward/std": 2.911441469192505,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 330
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2,
"completions/max_length": 32.0,
"completions/max_terminated_length": 17.8,
"completions/mean_length": 7.90625,
"completions/mean_terminated_length": 1.8667908191680909,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11048812664907652,
"grad_norm": 1.3491684198379517,
"kl": 9.65625,
"learning_rate": 0.00016845471059286887,
"loss": 0.7327,
"num_tokens": 882242.0,
"reward": 41.10959243774414,
"reward_std": 8.142712497711182,
"rewards/conciseness_reward/mean": 8.424456214904785,
"rewards/conciseness_reward/std": 2.8741564750671387,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 335
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.31875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 16.6,
"completions/mean_length": 11.775,
"completions/mean_terminated_length": 2.221480059623718,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11213720316622691,
"grad_norm": 2.546013593673706,
"kl": 8.3375,
"learning_rate": 0.00016717205893229903,
"loss": 0.6472,
"num_tokens": 894454.0,
"reward": 36.69198989868164,
"reward_std": 8.701870346069336,
"rewards/conciseness_reward/mean": 7.519171237945557,
"rewards/conciseness_reward/std": 3.3117987632751467,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 340
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.23125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 13.6,
"completions/mean_length": 8.6375,
"completions/mean_terminated_length": 1.6123589992523193,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1137862796833773,
"grad_norm": 1.6078628301620483,
"kl": 5.08125,
"learning_rate": 0.00016586894601186805,
"loss": 0.4841,
"num_tokens": 907630.0,
"reward": 40.832821655273435,
"reward_std": 7.08829927444458,
"rewards/conciseness_reward/mean": 8.367738628387452,
"rewards/conciseness_reward/std": 2.808896017074585,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 345
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 32.0,
"completions/max_terminated_length": 23.2,
"completions/mean_length": 17.94375,
"completions/mean_terminated_length": 3.825910973548889,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11543535620052771,
"grad_norm": 8.127303123474121,
"kl": 16.021875,
"learning_rate": 0.00016454576877239507,
"loss": 1.0026,
"num_tokens": 920553.0,
"reward": 31.658840942382813,
"reward_std": 11.956652450561524,
"rewards/conciseness_reward/mean": 6.487744331359863,
"rewards/conciseness_reward/std": 3.047306680679321,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 350
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.49375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 20.6,
"completions/mean_length": 17.4375,
"completions/mean_terminated_length": 3.490882396697998,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1170844327176781,
"grad_norm": 2.003816843032837,
"kl": 5.35625,
"learning_rate": 0.0001632029302664851,
"loss": 0.5399,
"num_tokens": 936001.0,
"reward": 30.59487419128418,
"reward_std": 10.253981018066407,
"rewards/conciseness_reward/mean": 6.269708824157715,
"rewards/conciseness_reward/std": 3.309423828125,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 355
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 21.8,
"completions/mean_length": 10.33125,
"completions/mean_terminated_length": 2.5056591749191286,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.11873350923482849,
"grad_norm": 1.3860801458358765,
"kl": 6.5125,
"learning_rate": 0.0001618408395357554,
"loss": 0.6358,
"num_tokens": 947848.0,
"reward": 37.97140731811523,
"reward_std": 9.808005714416504,
"rewards/conciseness_reward/mean": 7.781358432769776,
"rewards/conciseness_reward/std": 3.182269048690796,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 360
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 19.0,
"completions/mean_length": 7.55625,
"completions/mean_terminated_length": 1.9193000078201294,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.1203825857519789,
"grad_norm": 3.3567373752593994,
"kl": 13.759375,
"learning_rate": 0.0001604599114862375,
"loss": 0.8571,
"num_tokens": 959841.0,
"reward": 41.46042251586914,
"reward_std": 6.935988235473633,
"rewards/conciseness_reward/mean": 8.496350860595703,
"rewards/conciseness_reward/std": 2.798071002960205,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 365
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25625,
"completions/max_length": 32.0,
"completions/max_terminated_length": 14.2,
"completions/mean_length": 9.6625,
"completions/mean_terminated_length": 2.0050908803939818,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12203166226912929,
"grad_norm": 1.5818016529083252,
"kl": 8.46875,
"learning_rate": 0.00015906056676199255,
"loss": 0.7285,
"num_tokens": 971895.0,
"reward": 38.987307739257815,
"reward_std": 9.816894721984863,
"rewards/conciseness_reward/mean": 7.989543151855469,
"rewards/conciseness_reward/std": 3.1488665103912354,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 370
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 12.8,
"completions/mean_length": 8.95,
"completions/mean_terminated_length": 1.782608699798584,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12368073878627968,
"grad_norm": 2.568260669708252,
"kl": 8.846875,
"learning_rate": 0.00015764323161697935,
"loss": 0.7342,
"num_tokens": 983269.0,
"reward": 40.019395446777345,
"reward_std": 8.695895671844482,
"rewards/conciseness_reward/mean": 8.201046085357666,
"rewards/conciseness_reward/std": 2.975964069366455,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 375
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.2375,
"completions/max_length": 32.0,
"completions/max_terminated_length": 10.0,
"completions/mean_length": 8.64375,
"completions/mean_terminated_length": 1.3947399377822876,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12532981530343007,
"grad_norm": 1.7566519975662231,
"kl": 10.090625,
"learning_rate": 0.00015620833778521307,
"loss": 0.7109,
"num_tokens": 994490.0,
"reward": 40.795552825927736,
"reward_std": 6.594453907012939,
"rewards/conciseness_reward/mean": 8.360101222991943,
"rewards/conciseness_reward/std": 2.781657338142395,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 380
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.36875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 1.0,
"completions/mean_length": 12.43125,
"completions/mean_terminated_length": 1.0,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12697889182058048,
"grad_norm": 1.1844645738601685,
"kl": 6.378125,
"learning_rate": 0.00015475632234925504,
"loss": 0.6129,
"num_tokens": 1006117.0,
"reward": 36.47860527038574,
"reward_std": 11.079174518585205,
"rewards/conciseness_reward/mean": 7.475443267822266,
"rewards/conciseness_reward/std": 3.2058629512786867,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 385
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5125,
"completions/max_length": 32.0,
"completions/max_terminated_length": 9.4,
"completions/mean_length": 17.15,
"completions/mean_terminated_length": 1.519215726852417,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.12862796833773088,
"grad_norm": 1.8607726097106934,
"kl": 10.7625,
"learning_rate": 0.000153287627607073,
"loss": 0.8252,
"num_tokens": 1019911.0,
"reward": 31.58481674194336,
"reward_std": 12.696942138671876,
"rewards/conciseness_reward/mean": 6.472575092315674,
"rewards/conciseness_reward/std": 3.4355133533477784,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.0,
"rewards/reward_func_keywords/std": 0.0,
"step": 390
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.26875,
"completions/max_length": 32.0,
"completions/max_terminated_length": 9.2,
"completions/mean_length": 9.78125,
"completions/mean_terminated_length": 1.5907407760620118,
"completions/min_length": 1.0,
"completions/min_terminated_length": 1.0,
"epoch": 0.13027704485488126,
"grad_norm": 1.307873010635376,
"kl": 7.2125,
"learning_rate": 0.00015180270093731303,
"loss": 0.6376,
"num_tokens": 1034198.0,
"reward": 39.4156982421875,
"reward_std": 8.192217206954956,
"rewards/conciseness_reward/mean": 8.075745105743408,
"rewards/conciseness_reward/std": 3.032507038116455,
"rewards/reward_func_correct_answer/mean": 0.0,
"rewards/reward_func_correct_answer/std": 0.0,
"rewards/reward_func_keywords/mean": 0.002083333395421505,
"rewards/reward_func_keywords/std": 0.01178511381149292,
"step": 395
},
{
"epoch": 0.13192612137203166,
"grad_norm": 1.1687965393066406,
"learning_rate": 0.00015030199466302353,
"loss": 0.5685,
"step": 400
},
{
"epoch": 0.13192612137203166,
"eval_clip_ratio/high_max": 0.0,
"eval_clip_ratio/high_mean": 0.0,
"eval_clip_ratio/low_mean": 0.0,
"eval_clip_ratio/low_min": 0.0,
"eval_clip_ratio/region_mean": 0.0,
"eval_completions/clipped_ratio": 0.2789835164835165,
"eval_completions/max_length": 32.0,
"eval_completions/max_terminated_length": 8.082417582417582,
"eval_completions/mean_length": 10.17390110204508,
"eval_completions/mean_terminated_length": 1.7402353758340354,
"eval_completions/min_length": 1.0,
"eval_completions/min_terminated_length": 1.0,
"eval_kl": 12.785199175824175,
"eval_loss": 0.8869044184684753,
"eval_num_tokens": 1045628.0,
"eval_reward": 39.34467090355171,
"eval_reward_std": 9.140103686150614,
"eval_rewards/conciseness_reward/mean": 8.06277670441093,
"eval_rewards/conciseness_reward/std": 2.9405157920587195,
"eval_rewards/reward_func_correct_answer/mean": 0.0,
"eval_rewards/reward_func_correct_answer/std": 0.0,
"eval_rewards/reward_func_keywords/mean": 0.0,
"eval_rewards/reward_func_keywords/std": 0.0,
"eval_runtime": 259.9033,
"eval_samples_per_second": 5.598,
"eval_steps_per_second": 0.35,
"step": 400
}
],
"logging_steps": 5,
"max_steps": 1000,
"num_input_tokens_seen": 1045628,
"num_train_epochs": 1,
"save_steps": 100,
"stateful_callbacks": {
"EarlyStoppingCallback": {
"args": {
"early_stopping_patience": 5,
"early_stopping_threshold": 0.0001
},
"attributes": {
"early_stopping_patience_counter": 0
}
},
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}