Tool-R0-Qwen2.5-3B / trainer_state.json
emrecanacikgoz's picture
Upload folder using huggingface_hub
140d572 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.2,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 237.0,
"completions/max_terminated_length": 237.0,
"completions/mean_length": 114.4375,
"completions/mean_terminated_length": 114.4375,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 0.23012111708521843,
"epoch": 0.004,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.32706609795270336,
"learning_rate": 1e-06,
"loss": 0.0,
"num_tokens": 21278.0,
"reward": 1.8497917652130127,
"reward_std": 0.09231126308441162,
"rewards/accuracy_reward_func/mean": 0.871666669845581,
"rewards/accuracy_reward_func/std": 0.21418261528015137,
"rewards/format_reward_func/mean": 0.9781249761581421,
"rewards/format_reward_func/std": 0.1237436830997467,
"step": 1,
"step_time": 23.142175153829157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 175.0,
"completions/max_terminated_length": 175.0,
"completions/mean_length": 95.9375,
"completions/mean_terminated_length": 95.9375,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 0.2443241998553276,
"epoch": 0.008,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.33467634062811474,
"learning_rate": 9.8e-07,
"loss": 0.0,
"num_tokens": 43776.0,
"reward": 1.8129092454910278,
"reward_std": 0.02420501410961151,
"rewards/accuracy_reward_func/mean": 0.8129092454910278,
"rewards/accuracy_reward_func/std": 0.21289166808128357,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 2,
"step_time": 9.088656539097428
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 145.0,
"completions/max_terminated_length": 145.0,
"completions/mean_length": 97.03125,
"completions/mean_terminated_length": 97.03125,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 0.19135062769055367,
"epoch": 0.012,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.4170485616641671,
"learning_rate": 9.6e-07,
"loss": -0.0,
"num_tokens": 70409.0,
"reward": 1.906822919845581,
"reward_std": 0.0494791716337204,
"rewards/accuracy_reward_func/mean": 0.906822919845581,
"rewards/accuracy_reward_func/std": 0.13648581504821777,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 3,
"step_time": 8.136402582749724
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 137.0,
"completions/max_terminated_length": 137.0,
"completions/mean_length": 99.125,
"completions/mean_terminated_length": 99.125,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"entropy": 0.25429725274443626,
"epoch": 0.016,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7524404289855315,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"num_tokens": 94161.0,
"reward": 1.7960565090179443,
"reward_std": 0.13303174078464508,
"rewards/accuracy_reward_func/mean": 0.8085565567016602,
"rewards/accuracy_reward_func/std": 0.24971628189086914,
"rewards/format_reward_func/mean": 0.987500011920929,
"rewards/format_reward_func/std": 0.0707106739282608,
"step": 4,
"step_time": 7.919396638870239
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 258.0,
"completions/max_terminated_length": 258.0,
"completions/mean_length": 127.75,
"completions/mean_terminated_length": 127.75,
"completions/min_length": 78.0,
"completions/min_terminated_length": 78.0,
"entropy": 0.24686714261770248,
"epoch": 0.02,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.5703983537961373,
"learning_rate": 9.2e-07,
"loss": -0.0,
"num_tokens": 115581.0,
"reward": 1.9024033546447754,
"reward_std": 0.06382934749126434,
"rewards/accuracy_reward_func/mean": 0.9024032950401306,
"rewards/accuracy_reward_func/std": 0.14118432998657227,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 5,
"step_time": 11.992262025363743
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 145.0,
"completions/max_terminated_length": 145.0,
"completions/mean_length": 92.34375,
"completions/mean_terminated_length": 92.34375,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"entropy": 0.22791285440325737,
"epoch": 0.024,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.2647928147370579,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 140948.0,
"reward": 1.7881250381469727,
"reward_std": 0.016249999403953552,
"rewards/accuracy_reward_func/mean": 0.7881250381469727,
"rewards/accuracy_reward_func/std": 0.23106878995895386,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 6,
"step_time": 8.167283555492759
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 101.59375,
"completions/mean_terminated_length": 101.59375,
"completions/min_length": 61.0,
"completions/min_terminated_length": 61.0,
"entropy": 0.2525057829916477,
"epoch": 0.028,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.28702743314019674,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"num_tokens": 167411.0,
"reward": 1.9117188453674316,
"reward_std": 0.04828793182969093,
"rewards/accuracy_reward_func/mean": 0.9117187261581421,
"rewards/accuracy_reward_func/std": 0.12177487462759018,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 7,
"step_time": 8.586366776376963
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 198.0,
"completions/max_terminated_length": 198.0,
"completions/mean_length": 101.90625,
"completions/mean_terminated_length": 101.90625,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"entropy": 0.251990407705307,
"epoch": 0.032,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.6162980049635086,
"learning_rate": 8.599999999999999e-07,
"loss": -0.0,
"num_tokens": 192304.0,
"reward": 1.9317708015441895,
"reward_std": 0.0726683959364891,
"rewards/accuracy_reward_func/mean": 0.9317708611488342,
"rewards/accuracy_reward_func/std": 0.12373802810907364,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 8,
"step_time": 10.18786786403507
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 171.0,
"completions/max_terminated_length": 171.0,
"completions/mean_length": 89.53125,
"completions/mean_terminated_length": 89.53125,
"completions/min_length": 53.0,
"completions/min_terminated_length": 53.0,
"entropy": 0.2736722156405449,
"epoch": 0.036,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.7002448094598603,
"learning_rate": 8.399999999999999e-07,
"loss": -0.0,
"num_tokens": 217177.0,
"reward": 1.8406250476837158,
"reward_std": 0.09107423573732376,
"rewards/accuracy_reward_func/mean": 0.840624988079071,
"rewards/accuracy_reward_func/std": 0.3231492042541504,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 9,
"step_time": 9.146976439282298
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 134.0,
"completions/max_terminated_length": 134.0,
"completions/mean_length": 88.40625,
"completions/mean_terminated_length": 88.40625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"entropy": 0.2195826843380928,
"epoch": 0.04,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.4544179080346874,
"learning_rate": 8.199999999999999e-07,
"loss": -0.0,
"num_tokens": 241142.0,
"reward": 1.9390909671783447,
"reward_std": 0.014433760195970535,
"rewards/accuracy_reward_func/mean": 0.9390908479690552,
"rewards/accuracy_reward_func/std": 0.0804174616932869,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 10,
"step_time": 7.882300075143576
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 111.0,
"completions/max_terminated_length": 111.0,
"completions/mean_length": 87.5625,
"completions/mean_terminated_length": 87.5625,
"completions/min_length": 59.0,
"completions/min_terminated_length": 59.0,
"entropy": 0.19999410584568977,
"epoch": 0.044,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.31705242310143567,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 268372.0,
"reward": 1.909999966621399,
"reward_std": 0.0329379141330719,
"rewards/accuracy_reward_func/mean": 0.9099999666213989,
"rewards/accuracy_reward_func/std": 0.1774914711713791,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 11,
"step_time": 7.217764110304415
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 192.0,
"completions/max_terminated_length": 192.0,
"completions/mean_length": 103.25,
"completions/mean_terminated_length": 103.25,
"completions/min_length": 60.0,
"completions/min_terminated_length": 60.0,
"entropy": 0.2381710633635521,
"epoch": 0.048,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.29876243636744126,
"learning_rate": 7.799999999999999e-07,
"loss": -0.0,
"num_tokens": 286592.0,
"reward": 1.8604166507720947,
"reward_std": 0.11249998956918716,
"rewards/accuracy_reward_func/mean": 0.8604166507720947,
"rewards/accuracy_reward_func/std": 0.31079012155532837,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 12,
"step_time": 9.55866174865514
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 168.0,
"completions/max_terminated_length": 168.0,
"completions/mean_length": 87.6875,
"completions/mean_terminated_length": 87.6875,
"completions/min_length": 53.0,
"completions/min_terminated_length": 53.0,
"entropy": 0.21462798118591309,
"epoch": 0.052,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.7167465527886859,
"learning_rate": 7.599999999999999e-07,
"loss": -0.0,
"num_tokens": 306702.0,
"reward": 1.9366666078567505,
"reward_std": 0.02041665092110634,
"rewards/accuracy_reward_func/mean": 0.9366666674613953,
"rewards/accuracy_reward_func/std": 0.11999402940273285,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 13,
"step_time": 8.825894831679761
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 168.0,
"completions/max_terminated_length": 168.0,
"completions/mean_length": 119.375,
"completions/mean_terminated_length": 119.375,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"entropy": 0.2256241999566555,
"epoch": 0.056,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.5531863595246166,
"learning_rate": 7.4e-07,
"loss": 0.0,
"num_tokens": 333758.0,
"reward": 1.8471875190734863,
"reward_std": 0.14163094758987427,
"rewards/accuracy_reward_func/mean": 0.8471875190734863,
"rewards/accuracy_reward_func/std": 0.26768702268600464,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 14,
"step_time": 8.871076120994985
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 151.0,
"completions/max_terminated_length": 151.0,
"completions/mean_length": 99.125,
"completions/mean_terminated_length": 99.125,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 0.21050135791301727,
"epoch": 0.06,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.5955394753926603,
"learning_rate": 7.2e-07,
"loss": -0.0,
"num_tokens": 363766.0,
"reward": 1.8220758438110352,
"reward_std": 0.18934205174446106,
"rewards/accuracy_reward_func/mean": 0.8533259034156799,
"rewards/accuracy_reward_func/std": 0.2272060364484787,
"rewards/format_reward_func/mean": 0.96875,
"rewards/format_reward_func/std": 0.1767766922712326,
"step": 15,
"step_time": 8.423650750890374
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 94.875,
"completions/mean_terminated_length": 94.875,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 0.23235702514648438,
"epoch": 0.064,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.31149648184979223,
"learning_rate": 7e-07,
"loss": -0.0,
"num_tokens": 383022.0,
"reward": 1.8937499523162842,
"reward_std": 0.11737333238124847,
"rewards/accuracy_reward_func/mean": 0.9156249761581421,
"rewards/accuracy_reward_func/std": 0.19610625505447388,
"rewards/format_reward_func/mean": 0.9781249761581421,
"rewards/format_reward_func/std": 0.1237436830997467,
"step": 16,
"step_time": 8.814181880094111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 216.0,
"completions/max_terminated_length": 216.0,
"completions/mean_length": 116.03125,
"completions/mean_terminated_length": 116.03125,
"completions/min_length": 77.0,
"completions/min_terminated_length": 77.0,
"entropy": 0.21332164481282234,
"epoch": 0.068,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.6314309586182766,
"learning_rate": 6.800000000000001e-07,
"loss": -0.0,
"num_tokens": 409507.0,
"reward": 1.7729910612106323,
"reward_std": 0.08083245158195496,
"rewards/accuracy_reward_func/mean": 0.7729910612106323,
"rewards/accuracy_reward_func/std": 0.2013828307390213,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 17,
"step_time": 10.377578075043857
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 194.0,
"completions/max_terminated_length": 194.0,
"completions/mean_length": 124.0,
"completions/mean_terminated_length": 124.0,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 0.3000790849328041,
"epoch": 0.072,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.43916067550283777,
"learning_rate": 6.6e-07,
"loss": -0.0,
"num_tokens": 429839.0,
"reward": 1.7817708253860474,
"reward_std": 0.0970831960439682,
"rewards/accuracy_reward_func/mean": 0.7817708253860474,
"rewards/accuracy_reward_func/std": 0.20934097468852997,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 18,
"step_time": 9.632790027186275
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 180.0,
"completions/max_terminated_length": 180.0,
"completions/mean_length": 113.8125,
"completions/mean_terminated_length": 113.8125,
"completions/min_length": 69.0,
"completions/min_terminated_length": 69.0,
"entropy": 0.24520108476281166,
"epoch": 0.076,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.8224032747868548,
"learning_rate": 6.4e-07,
"loss": -0.0,
"num_tokens": 455401.0,
"reward": 1.8831250667572021,
"reward_std": 0.0729166716337204,
"rewards/accuracy_reward_func/mean": 0.8831250071525574,
"rewards/accuracy_reward_func/std": 0.1986341029405594,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 19,
"step_time": 9.248267728835344
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 161.0,
"completions/max_terminated_length": 161.0,
"completions/mean_length": 94.28125,
"completions/mean_terminated_length": 94.28125,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 0.2287127859890461,
"epoch": 0.08,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.7435614457550869,
"learning_rate": 6.2e-07,
"loss": 0.0,
"num_tokens": 482190.0,
"reward": 1.8910417556762695,
"reward_std": 0.08611349761486053,
"rewards/accuracy_reward_func/mean": 0.8910416960716248,
"rewards/accuracy_reward_func/std": 0.18090461194515228,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 20,
"step_time": 8.669513036496937
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 137.0,
"completions/max_terminated_length": 137.0,
"completions/mean_length": 98.0,
"completions/mean_terminated_length": 98.0,
"completions/min_length": 71.0,
"completions/min_terminated_length": 71.0,
"entropy": 0.23622526600956917,
"epoch": 0.084,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.3037737034258383,
"learning_rate": 6e-07,
"loss": 0.0,
"num_tokens": 504026.0,
"reward": 1.7918750047683716,
"reward_std": 0.10315428674221039,
"rewards/accuracy_reward_func/mean": 0.7918750047683716,
"rewards/accuracy_reward_func/std": 0.3606663942337036,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 21,
"step_time": 8.023445818573236
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 133.0,
"completions/max_terminated_length": 133.0,
"completions/mean_length": 94.4375,
"completions/mean_terminated_length": 94.4375,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 0.21428008005023003,
"epoch": 0.088,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.29203554195255926,
"learning_rate": 5.8e-07,
"loss": 0.0,
"num_tokens": 530080.0,
"reward": 1.9406249523162842,
"reward_std": 0.045683760195970535,
"rewards/accuracy_reward_func/mean": 0.940625011920929,
"rewards/accuracy_reward_func/std": 0.10506334155797958,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 22,
"step_time": 7.778694893233478
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 232.0,
"completions/max_terminated_length": 232.0,
"completions/mean_length": 110.40625,
"completions/mean_terminated_length": 110.40625,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 0.23138786852359772,
"epoch": 0.092,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.19686742995266426,
"learning_rate": 5.6e-07,
"loss": -0.0,
"num_tokens": 558297.0,
"reward": 1.8413751125335693,
"reward_std": 0.0037499964237213135,
"rewards/accuracy_reward_func/mean": 0.8413749933242798,
"rewards/accuracy_reward_func/std": 0.21956580877304077,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 23,
"step_time": 10.919259454123676
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 148.0,
"completions/max_terminated_length": 148.0,
"completions/mean_length": 102.59375,
"completions/mean_terminated_length": 102.59375,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 0.18613235652446747,
"epoch": 0.096,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.20640152621893892,
"learning_rate": 5.4e-07,
"loss": -0.0,
"num_tokens": 584700.0,
"reward": 1.8937499523162842,
"reward_std": 0.020833328366279602,
"rewards/accuracy_reward_func/mean": 0.893750011920929,
"rewards/accuracy_reward_func/std": 0.1515599936246872,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 24,
"step_time": 8.58028247859329
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 125.0,
"completions/max_terminated_length": 125.0,
"completions/mean_length": 97.625,
"completions/mean_terminated_length": 97.625,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 0.1680564060807228,
"epoch": 0.1,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.22607092328648917,
"learning_rate": 5.2e-07,
"loss": -0.0,
"num_tokens": 605736.0,
"reward": 1.9614583253860474,
"reward_std": 0.04327813535928726,
"rewards/accuracy_reward_func/mean": 0.9614583253860474,
"rewards/accuracy_reward_func/std": 0.09804884344339371,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 25,
"step_time": 7.494132779538631
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 218.0,
"completions/max_terminated_length": 218.0,
"completions/mean_length": 110.84375,
"completions/mean_terminated_length": 110.84375,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 0.2523631304502487,
"epoch": 0.104,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.6963240544036335,
"learning_rate": 5e-07,
"loss": -0.0,
"num_tokens": 629803.0,
"reward": 1.722395896911621,
"reward_std": 0.03437499701976776,
"rewards/accuracy_reward_func/mean": 0.7223958373069763,
"rewards/accuracy_reward_func/std": 0.2913203537464142,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 26,
"step_time": 10.387551098130643
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 206.0,
"completions/max_terminated_length": 206.0,
"completions/mean_length": 101.65625,
"completions/mean_terminated_length": 101.65625,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 0.28546470403671265,
"epoch": 0.108,
"frac_reward_zero_std": 0.25,
"grad_norm": 0.8320895685645606,
"learning_rate": 4.8e-07,
"loss": -0.0,
"num_tokens": 655600.0,
"reward": 1.6748958826065063,
"reward_std": 0.21159148216247559,
"rewards/accuracy_reward_func/mean": 0.7405208349227905,
"rewards/accuracy_reward_func/std": 0.3287121653556824,
"rewards/format_reward_func/mean": 0.934374988079071,
"rewards/format_reward_func/std": 0.20730119943618774,
"step": 27,
"step_time": 9.990737781859934
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 321.0,
"completions/max_terminated_length": 321.0,
"completions/mean_length": 121.78125,
"completions/mean_terminated_length": 121.78125,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 0.20365699753165245,
"epoch": 0.112,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.3040693070538053,
"learning_rate": 4.6e-07,
"loss": -0.0,
"num_tokens": 681285.0,
"reward": 1.8820312023162842,
"reward_std": 0.04736516997218132,
"rewards/accuracy_reward_func/mean": 0.882031261920929,
"rewards/accuracy_reward_func/std": 0.16079869866371155,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 28,
"step_time": 13.582923103123903
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 175.0,
"completions/max_terminated_length": 175.0,
"completions/mean_length": 114.3125,
"completions/mean_terminated_length": 114.3125,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 0.22691119089722633,
"epoch": 0.116,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.29754584523046024,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"num_tokens": 700215.0,
"reward": 1.941562533378601,
"reward_std": 0.03998880088329315,
"rewards/accuracy_reward_func/mean": 0.9415625333786011,
"rewards/accuracy_reward_func/std": 0.11359171569347382,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 29,
"step_time": 9.000959642231464
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 144.0,
"completions/max_terminated_length": 144.0,
"completions/mean_length": 98.8125,
"completions/mean_terminated_length": 98.8125,
"completions/min_length": 76.0,
"completions/min_terminated_length": 76.0,
"entropy": 0.19718682020902634,
"epoch": 0.12,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.28356832567704315,
"learning_rate": 4.1999999999999995e-07,
"loss": 0.0,
"num_tokens": 719033.0,
"reward": 1.8567261695861816,
"reward_std": 0.03630475699901581,
"rewards/accuracy_reward_func/mean": 0.8567261695861816,
"rewards/accuracy_reward_func/std": 0.244332417845726,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 30,
"step_time": 8.117130983620882
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 300.0,
"completions/max_terminated_length": 300.0,
"completions/mean_length": 107.0625,
"completions/mean_terminated_length": 107.0625,
"completions/min_length": 67.0,
"completions/min_terminated_length": 67.0,
"entropy": 0.23164699599146843,
"epoch": 0.124,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.3147460209715464,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 742811.0,
"reward": 1.8263542652130127,
"reward_std": 0.07890324294567108,
"rewards/accuracy_reward_func/mean": 0.8263541460037231,
"rewards/accuracy_reward_func/std": 0.2404511272907257,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 31,
"step_time": 13.022676510736346
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 145.0,
"completions/max_terminated_length": 145.0,
"completions/mean_length": 104.28125,
"completions/mean_terminated_length": 104.28125,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 0.2124277576804161,
"epoch": 0.128,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.14656459241245287,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"num_tokens": 767276.0,
"reward": 1.933750033378601,
"reward_std": 0.004330122843384743,
"rewards/accuracy_reward_func/mean": 0.9337500333786011,
"rewards/accuracy_reward_func/std": 0.16721147298812866,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 32,
"step_time": 8.787895078770816
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 163.0,
"completions/max_terminated_length": 163.0,
"completions/mean_length": 99.6875,
"completions/mean_terminated_length": 99.6875,
"completions/min_length": 66.0,
"completions/min_terminated_length": 66.0,
"entropy": 0.18238281086087227,
"epoch": 0.132,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.17864514961842184,
"learning_rate": 3.6e-07,
"loss": 0.0,
"num_tokens": 795462.0,
"reward": 1.957291603088379,
"reward_std": 0.0024056239053606987,
"rewards/accuracy_reward_func/mean": 0.9572916626930237,
"rewards/accuracy_reward_func/std": 0.0789080411195755,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 33,
"step_time": 8.68917733244598
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 105.21875,
"completions/mean_terminated_length": 105.21875,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 0.20627178624272346,
"epoch": 0.136,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.3530815395243793,
"learning_rate": 3.4000000000000003e-07,
"loss": 0.0,
"num_tokens": 820193.0,
"reward": 1.7807291746139526,
"reward_std": 0.06852563470602036,
"rewards/accuracy_reward_func/mean": 0.7807291746139526,
"rewards/accuracy_reward_func/std": 0.34384265542030334,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 34,
"step_time": 8.745650510303676
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 165.0,
"completions/max_terminated_length": 165.0,
"completions/mean_length": 112.15625,
"completions/mean_terminated_length": 112.15625,
"completions/min_length": 81.0,
"completions/min_terminated_length": 81.0,
"entropy": 0.260306891053915,
"epoch": 0.14,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.5856316529561636,
"learning_rate": 3.2e-07,
"loss": -0.0,
"num_tokens": 844158.0,
"reward": 1.7372127771377563,
"reward_std": 0.0817936509847641,
"rewards/accuracy_reward_func/mean": 0.7372127771377563,
"rewards/accuracy_reward_func/std": 0.2763761281967163,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 35,
"step_time": 8.764568363316357
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 246.0,
"completions/max_terminated_length": 246.0,
"completions/mean_length": 120.65625,
"completions/mean_terminated_length": 120.65625,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"entropy": 0.2644369825720787,
"epoch": 0.144,
"frac_reward_zero_std": 0.375,
"grad_norm": 0.3867881480590169,
"learning_rate": 3e-07,
"loss": -0.0,
"num_tokens": 870035.0,
"reward": 1.723668098449707,
"reward_std": 0.1543687880039215,
"rewards/accuracy_reward_func/mean": 0.7455431222915649,
"rewards/accuracy_reward_func/std": 0.3101375699043274,
"rewards/format_reward_func/mean": 0.9781249761581421,
"rewards/format_reward_func/std": 0.1237436830997467,
"step": 36,
"step_time": 11.264538847841322
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 162.0,
"completions/max_terminated_length": 162.0,
"completions/mean_length": 100.28125,
"completions/mean_terminated_length": 100.28125,
"completions/min_length": 72.0,
"completions/min_terminated_length": 72.0,
"entropy": 0.23801737278699875,
"epoch": 0.148,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.4666251511091792,
"learning_rate": 2.8e-07,
"loss": -0.0,
"num_tokens": 893872.0,
"reward": 1.8420684337615967,
"reward_std": 0.1202840656042099,
"rewards/accuracy_reward_func/mean": 0.8639434576034546,
"rewards/accuracy_reward_func/std": 0.228593647480011,
"rewards/format_reward_func/mean": 0.9781249761581421,
"rewards/format_reward_func/std": 0.1237436830997467,
"step": 37,
"step_time": 8.666291879490018
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 198.0,
"completions/max_terminated_length": 198.0,
"completions/mean_length": 110.9375,
"completions/mean_terminated_length": 110.9375,
"completions/min_length": 73.0,
"completions/min_terminated_length": 73.0,
"entropy": 0.23708894476294518,
"epoch": 0.152,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.3003587942152678,
"learning_rate": 2.6e-07,
"loss": 0.0,
"num_tokens": 919606.0,
"reward": 1.9385044574737549,
"reward_std": 0.02471514418721199,
"rewards/accuracy_reward_func/mean": 0.9385044574737549,
"rewards/accuracy_reward_func/std": 0.0629124566912651,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 38,
"step_time": 9.764362094923854
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 146.0,
"completions/max_terminated_length": 146.0,
"completions/mean_length": 96.75,
"completions/mean_terminated_length": 96.75,
"completions/min_length": 68.0,
"completions/min_terminated_length": 68.0,
"entropy": 0.23468047007918358,
"epoch": 0.156,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.37048827303687887,
"learning_rate": 2.4e-07,
"loss": -0.0,
"num_tokens": 945630.0,
"reward": 1.731874942779541,
"reward_std": 0.1671428233385086,
"rewards/accuracy_reward_func/mean": 0.7756249904632568,
"rewards/accuracy_reward_func/std": 0.3532584309577942,
"rewards/format_reward_func/mean": 0.956250011920929,
"rewards/format_reward_func/std": 0.1721542775630951,
"step": 39,
"step_time": 9.183467078953981
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 173.0,
"completions/max_terminated_length": 173.0,
"completions/mean_length": 109.0,
"completions/mean_terminated_length": 109.0,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 0.23336144164204597,
"epoch": 0.16,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.287196945879942,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"num_tokens": 967418.0,
"reward": 1.9270832538604736,
"reward_std": 0.012028136290609837,
"rewards/accuracy_reward_func/mean": 0.9270833134651184,
"rewards/accuracy_reward_func/std": 0.1690024584531784,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 40,
"step_time": 9.006164254620671
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 143.0,
"completions/max_terminated_length": 143.0,
"completions/mean_length": 99.28125,
"completions/mean_terminated_length": 99.28125,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 0.22068660333752632,
"epoch": 0.164,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.26699909188358745,
"learning_rate": 2e-07,
"loss": 0.0,
"num_tokens": 994215.0,
"reward": 1.9018750190734863,
"reward_std": 0.06372595578432083,
"rewards/accuracy_reward_func/mean": 0.9018750190734863,
"rewards/accuracy_reward_func/std": 0.15860149264335632,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 41,
"step_time": 8.13273252826184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 149.0,
"completions/max_terminated_length": 149.0,
"completions/mean_length": 99.625,
"completions/mean_terminated_length": 99.625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 0.23406217247247696,
"epoch": 0.168,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.3533172869156224,
"learning_rate": 1.8e-07,
"loss": -0.0,
"num_tokens": 1019951.0,
"reward": 1.9318749904632568,
"reward_std": 0.04643829166889191,
"rewards/accuracy_reward_func/mean": 0.9318749904632568,
"rewards/accuracy_reward_func/std": 0.10333744436502457,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 42,
"step_time": 8.258449734188616
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 118.0,
"completions/max_terminated_length": 118.0,
"completions/mean_length": 92.75,
"completions/mean_terminated_length": 92.75,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 0.22229528427124023,
"epoch": 0.172,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.25935443501058275,
"learning_rate": 1.6e-07,
"loss": 0.0,
"num_tokens": 1047835.0,
"reward": 1.9731919765472412,
"reward_std": 0.022991076111793518,
"rewards/accuracy_reward_func/mean": 0.9731919765472412,
"rewards/accuracy_reward_func/std": 0.05601184815168381,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 43,
"step_time": 7.3312763730064034
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 132.0,
"completions/max_terminated_length": 132.0,
"completions/mean_length": 98.15625,
"completions/mean_terminated_length": 98.15625,
"completions/min_length": 70.0,
"completions/min_terminated_length": 70.0,
"entropy": 0.2684129625558853,
"epoch": 0.176,
"frac_reward_zero_std": 0.75,
"grad_norm": 0.5054404086636916,
"learning_rate": 1.4e-07,
"loss": -0.0,
"num_tokens": 1070652.0,
"reward": 1.816145896911621,
"reward_std": 0.046046242117881775,
"rewards/accuracy_reward_func/mean": 0.8161457777023315,
"rewards/accuracy_reward_func/std": 0.21198345720767975,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 44,
"step_time": 8.090661917813122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 189.0,
"completions/max_terminated_length": 189.0,
"completions/mean_length": 119.96875,
"completions/mean_terminated_length": 119.96875,
"completions/min_length": 63.0,
"completions/min_terminated_length": 63.0,
"entropy": 0.20931534841656685,
"epoch": 0.18,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.4438102774930686,
"learning_rate": 1.2e-07,
"loss": 0.0,
"num_tokens": 1094159.0,
"reward": 1.8079866170883179,
"reward_std": 0.07453451305627823,
"rewards/accuracy_reward_func/mean": 0.8079866170883179,
"rewards/accuracy_reward_func/std": 0.1761476993560791,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 45,
"step_time": 9.527460671961308
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 215.0,
"completions/max_terminated_length": 215.0,
"completions/mean_length": 107.15625,
"completions/mean_terminated_length": 107.15625,
"completions/min_length": 75.0,
"completions/min_terminated_length": 75.0,
"entropy": 0.25717881694436073,
"epoch": 0.184,
"frac_reward_zero_std": 0.875,
"grad_norm": 0.21670389917367197,
"learning_rate": 1e-07,
"loss": 0.0,
"num_tokens": 1122608.0,
"reward": 1.808333396911621,
"reward_std": 0.03125,
"rewards/accuracy_reward_func/mean": 0.8083333373069763,
"rewards/accuracy_reward_func/std": 0.21151866018772125,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 46,
"step_time": 10.36410805862397
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 126.0,
"completions/max_terminated_length": 126.0,
"completions/mean_length": 90.375,
"completions/mean_terminated_length": 90.375,
"completions/min_length": 62.0,
"completions/min_terminated_length": 62.0,
"entropy": 0.21086286380887032,
"epoch": 0.188,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.4243741948522665,
"learning_rate": 8e-08,
"loss": 0.0,
"num_tokens": 1142524.0,
"reward": 1.8813542127609253,
"reward_std": 0.03895833343267441,
"rewards/accuracy_reward_func/mean": 0.8813541531562805,
"rewards/accuracy_reward_func/std": 0.13226144015789032,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 47,
"step_time": 8.480774418450892
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 151.0,
"completions/max_terminated_length": 151.0,
"completions/mean_length": 98.96875,
"completions/mean_terminated_length": 98.96875,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"entropy": 0.25533241406083107,
"epoch": 0.192,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.363734391468509,
"learning_rate": 6e-08,
"loss": 0.0,
"num_tokens": 1167279.0,
"reward": 1.8920758962631226,
"reward_std": 0.043149448931217194,
"rewards/accuracy_reward_func/mean": 0.8920758962631226,
"rewards/accuracy_reward_func/std": 0.18521229922771454,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 48,
"step_time": 8.397036101669073
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 158.0,
"completions/max_terminated_length": 158.0,
"completions/mean_length": 93.71875,
"completions/mean_terminated_length": 93.71875,
"completions/min_length": 55.0,
"completions/min_terminated_length": 55.0,
"entropy": 0.2753983736038208,
"epoch": 0.196,
"frac_reward_zero_std": 0.625,
"grad_norm": 0.37001348543936974,
"learning_rate": 4e-08,
"loss": 0.0,
"num_tokens": 1189970.0,
"reward": 1.7348958253860474,
"reward_std": 0.0466608926653862,
"rewards/accuracy_reward_func/mean": 0.7348958253860474,
"rewards/accuracy_reward_func/std": 0.23915956914424896,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 49,
"step_time": 9.01807147078216
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 685.0,
"completions/max_terminated_length": 685.0,
"completions/mean_length": 130.3125,
"completions/mean_terminated_length": 130.3125,
"completions/min_length": 65.0,
"completions/min_terminated_length": 65.0,
"entropy": 0.271317683160305,
"epoch": 0.2,
"frac_reward_zero_std": 0.5,
"grad_norm": 0.3729527358522691,
"learning_rate": 2e-08,
"loss": 0.0,
"num_tokens": 1214600.0,
"reward": 1.7553727626800537,
"reward_std": 0.13293951749801636,
"rewards/accuracy_reward_func/mean": 0.7553727626800537,
"rewards/accuracy_reward_func/std": 0.3456610441207886,
"rewards/format_reward_func/mean": 1.0,
"rewards/format_reward_func/std": 0.0,
"step": 50,
"step_time": 25.12439160142094
}
],
"logging_steps": 1.0,
"max_steps": 50,
"num_input_tokens_seen": 1214600,
"num_train_epochs": 1,
"save_steps": 500,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 2,
"trial_name": null,
"trial_params": null
}