TT_L0.2_H0.28_grpo / trainer_state.json
LLucass's picture
Model save
694e1e6 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22857142857142856,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2544344663619995,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 118418.0,
"reward": 0.17899775505065918,
"reward_std": 0.7650213241577148,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24362485110759735,
"learning_rate": 5e-08,
"loss": -0.0,
"num_tokens": 239748.0,
"reward": 0.3848632574081421,
"reward_std": 0.9111153483390808,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1228.0,
"completions/mean_length": 1952.96875,
"completions/mean_terminated_length": 831.6000366210938,
"completions/min_length": 608.0,
"completions/min_terminated_length": 608.0,
"epoch": 0.0034285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25938913226127625,
"learning_rate": 1e-07,
"loss": -0.0,
"num_tokens": 375210.0,
"reward": -0.31737297773361206,
"reward_std": 0.40810590982437134,
"rewards/cosine_scaled_reward/mean": -0.20556148886680603,
"rewards/cosine_scaled_reward/std": 0.2044239044189453,
"rewards/format_reward/mean": 0.09375,
"rewards/format_reward/std": 0.29378482699394226,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 1567.9375,
"completions/mean_terminated_length": 988.5516967773438,
"completions/min_length": 500.0,
"completions/min_terminated_length": 500.0,
"epoch": 0.004571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29626700282096863,
"learning_rate": 1.5e-07,
"loss": 0.0,
"num_tokens": 485366.0,
"reward": 0.1552329957485199,
"reward_std": 0.5780439376831055,
"rewards/cosine_scaled_reward/mean": -0.18800850212574005,
"rewards/cosine_scaled_reward/std": 0.2348431795835495,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1706.0,
"completions/mean_length": 1988.796875,
"completions/mean_terminated_length": 1100.75,
"completions/min_length": 573.0,
"completions/min_terminated_length": 573.0,
"epoch": 0.005714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2605815827846527,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 623465.0,
"reward": -0.4418099522590637,
"reward_std": 0.3239253759384155,
"rewards/cosine_scaled_reward/mean": -0.25215497612953186,
"rewards/cosine_scaled_reward/std": 0.1853509098291397,
"rewards/format_reward/mean": 0.0625,
"rewards/format_reward/std": 0.24397502839565277,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 1864.1875,
"completions/mean_terminated_length": 871.6000366210938,
"completions/min_length": 561.0,
"completions/min_terminated_length": 561.0,
"epoch": 0.006857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.262493759393692,
"learning_rate": 2.5e-07,
"loss": 0.0,
"num_tokens": 754421.0,
"reward": -0.2906607687473297,
"reward_std": 0.34858179092407227,
"rewards/cosine_scaled_reward/mean": -0.22345538437366486,
"rewards/cosine_scaled_reward/std": 0.16744518280029297,
"rewards/format_reward/mean": 0.15625,
"rewards/format_reward/std": 0.36596253514289856,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 1931.09375,
"completions/mean_terminated_length": 1216.6666259765625,
"completions/min_length": 554.0,
"completions/min_terminated_length": 554.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23469489812850952,
"learning_rate": 3e-07,
"loss": 0.0,
"num_tokens": 888419.0,
"reward": -0.046325311064720154,
"reward_std": 0.5296324491500854,
"rewards/cosine_scaled_reward/mean": -0.14035014808177948,
"rewards/cosine_scaled_reward/std": 0.36545559763908386,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1730.828125,
"completions/mean_terminated_length": 979.631591796875,
"completions/min_length": 281.0,
"completions/min_terminated_length": 281.0,
"epoch": 0.009142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22580823302268982,
"learning_rate": 3.5e-07,
"loss": -0.0,
"num_tokens": 1009608.0,
"reward": 0.22049131989479065,
"reward_std": 0.6817946434020996,
"rewards/cosine_scaled_reward/mean": -0.05381683632731438,
"rewards/cosine_scaled_reward/std": 0.44645029306411743,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1868.140625,
"completions/mean_terminated_length": 1225.7857666015625,
"completions/min_length": 892.0,
"completions/min_terminated_length": 892.0,
"epoch": 0.010285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26566582918167114,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 1140625.0,
"reward": -0.13664060831069946,
"reward_std": 0.6131436228752136,
"rewards/cosine_scaled_reward/mean": -0.19332030415534973,
"rewards/cosine_scaled_reward/std": 0.30607181787490845,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1456.0,
"completions/mean_length": 1664.890625,
"completions/mean_terminated_length": 757.5263061523438,
"completions/min_length": 411.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.011428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26136595010757446,
"learning_rate": 4.5e-07,
"loss": -0.0,
"num_tokens": 1258010.0,
"reward": 0.022913292050361633,
"reward_std": 0.545270562171936,
"rewards/cosine_scaled_reward/mean": -0.1369808465242386,
"rewards/cosine_scaled_reward/std": 0.3200873136520386,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1333.0,
"completions/mean_length": 1986.34375,
"completions/mean_terminated_length": 1061.5,
"completions/min_length": 841.0,
"completions/min_terminated_length": 841.0,
"epoch": 0.012571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23980404436588287,
"learning_rate": 5e-07,
"loss": -0.0,
"num_tokens": 1396808.0,
"reward": -0.45354267954826355,
"reward_std": 0.3950403332710266,
"rewards/cosine_scaled_reward/mean": -0.26583385467529297,
"rewards/cosine_scaled_reward/std": 0.16946381330490112,
"rewards/format_reward/mean": 0.078125,
"rewards/format_reward/std": 0.27048972249031067,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1756.453125,
"completions/mean_terminated_length": 1236.7391357421875,
"completions/min_length": 528.0,
"completions/min_terminated_length": 528.0,
"epoch": 0.013714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.277899831533432,
"learning_rate": 5.5e-07,
"loss": -0.0,
"num_tokens": 1520165.0,
"reward": 0.1507202684879303,
"reward_std": 0.7362544536590576,
"rewards/cosine_scaled_reward/mean": -0.14338986575603485,
"rewards/cosine_scaled_reward/std": 0.39759454131126404,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1492.0,
"completions/mean_length": 1742.125,
"completions/mean_terminated_length": 896.4705810546875,
"completions/min_length": 532.0,
"completions/min_terminated_length": 532.0,
"epoch": 0.014857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25388944149017334,
"learning_rate": 6e-07,
"loss": -0.0,
"num_tokens": 1642701.0,
"reward": 0.02508428692817688,
"reward_std": 0.5804874300956726,
"rewards/cosine_scaled_reward/mean": -0.13589535653591156,
"rewards/cosine_scaled_reward/std": 0.3501027524471283,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1791.6875,
"completions/mean_terminated_length": 1184.631591796875,
"completions/min_length": 396.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26855364441871643,
"learning_rate": 6.5e-07,
"loss": -0.0,
"num_tokens": 1767977.0,
"reward": 0.027098476886749268,
"reward_std": 0.7340880632400513,
"rewards/cosine_scaled_reward/mean": -0.14270076155662537,
"rewards/cosine_scaled_reward/std": 0.36128607392311096,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1875.0,
"completions/mean_length": 1707.828125,
"completions/mean_terminated_length": 902.1578979492188,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"epoch": 0.017142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2692890167236328,
"learning_rate": 7e-07,
"loss": 0.0,
"num_tokens": 1888198.0,
"reward": 0.24003228545188904,
"reward_std": 0.5003666281700134,
"rewards/cosine_scaled_reward/mean": -0.02842137962579727,
"rewards/cosine_scaled_reward/std": 0.43434321880340576,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1028.0,
"completions/mean_length": 1985.046875,
"completions/mean_terminated_length": 705.0,
"completions/min_length": 463.0,
"completions/min_terminated_length": 463.0,
"epoch": 0.018285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24488449096679688,
"learning_rate": 7.5e-07,
"loss": 0.0,
"num_tokens": 2025681.0,
"reward": -0.37671107053756714,
"reward_std": 0.4366358518600464,
"rewards/cosine_scaled_reward/mean": -0.21179303526878357,
"rewards/cosine_scaled_reward/std": 0.22632460296154022,
"rewards/format_reward/mean": 0.046875,
"rewards/format_reward/std": 0.21304203569889069,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1850.0,
"completions/mean_length": 1561.9375,
"completions/mean_terminated_length": 975.3103637695312,
"completions/min_length": 347.0,
"completions/min_terminated_length": 347.0,
"epoch": 0.019428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3149985373020172,
"learning_rate": 8e-07,
"loss": 0.0,
"num_tokens": 2136165.0,
"reward": 0.12802264094352722,
"reward_std": 0.6542905569076538,
"rewards/cosine_scaled_reward/mean": -0.1703636795282364,
"rewards/cosine_scaled_reward/std": 0.3502788841724396,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 1837.484375,
"completions/mean_terminated_length": 1085.6429443359375,
"completions/min_length": 574.0,
"completions/min_terminated_length": 574.0,
"epoch": 0.02057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20620153844356537,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0,
"num_tokens": 2264140.0,
"reward": -0.041578881442546844,
"reward_std": 0.7910969853401184,
"rewards/cosine_scaled_reward/mean": -0.16922692954540253,
"rewards/cosine_scaled_reward/std": 0.33054032921791077,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 1864.265625,
"completions/mean_terminated_length": 1143.4615478515625,
"completions/min_length": 605.0,
"completions/min_terminated_length": 605.0,
"epoch": 0.021714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2673085629940033,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 2394709.0,
"reward": 0.21812058985233307,
"reward_std": 0.8157521486282349,
"rewards/cosine_scaled_reward/mean": -0.02375221811234951,
"rewards/cosine_scaled_reward/std": 0.44612905383110046,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 1601.125,
"completions/mean_terminated_length": 856.3333740234375,
"completions/min_length": 437.0,
"completions/min_terminated_length": 437.0,
"epoch": 0.022857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27571871876716614,
"learning_rate": 9.499999999999999e-07,
"loss": -0.0,
"num_tokens": 2508533.0,
"reward": 0.13714352250099182,
"reward_std": 0.5742913484573364,
"rewards/cosine_scaled_reward/mean": -0.1267407238483429,
"rewards/cosine_scaled_reward/std": 0.379833847284317,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 1720.75,
"completions/mean_terminated_length": 945.6842041015625,
"completions/min_length": 260.0,
"completions/min_terminated_length": 260.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3101024329662323,
"learning_rate": 1e-06,
"loss": -0.0,
"num_tokens": 2629469.0,
"reward": 0.0758291482925415,
"reward_std": 0.5849478840827942,
"rewards/cosine_scaled_reward/mean": -0.13396042585372925,
"rewards/cosine_scaled_reward/std": 0.3641633689403534,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1930.0,
"completions/mean_length": 1310.1875,
"completions/mean_terminated_length": 805.368408203125,
"completions/min_length": 262.0,
"completions/min_terminated_length": 262.0,
"epoch": 0.025142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35955631732940674,
"learning_rate": 9.99931462820376e-07,
"loss": 0.0,
"num_tokens": 2722337.0,
"reward": 0.5670604705810547,
"reward_std": 0.5711978077888489,
"rewards/cosine_scaled_reward/mean": -0.03678226098418236,
"rewards/cosine_scaled_reward/std": 0.4319343566894531,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1702.0,
"completions/mean_length": 1607.890625,
"completions/mean_terminated_length": 1004.7777709960938,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.026285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27159449458122253,
"learning_rate": 9.997258721585931e-07,
"loss": -0.0,
"num_tokens": 2836034.0,
"reward": 0.20600585639476776,
"reward_std": 0.6732993721961975,
"rewards/cosine_scaled_reward/mean": -0.13137206435203552,
"rewards/cosine_scaled_reward/std": 0.38508084416389465,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 1755.890625,
"completions/mean_terminated_length": 1113.25,
"completions/min_length": 416.0,
"completions/min_terminated_length": 416.0,
"epoch": 0.027428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24094589054584503,
"learning_rate": 9.993832906395582e-07,
"loss": -0.0,
"num_tokens": 2959339.0,
"reward": 0.05567874014377594,
"reward_std": 0.7204875349998474,
"rewards/cosine_scaled_reward/mean": -0.15966063737869263,
"rewards/cosine_scaled_reward/std": 0.3462846875190735,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1870.0,
"completions/mean_length": 1761.09375,
"completions/mean_terminated_length": 1027.888916015625,
"completions/min_length": 562.0,
"completions/min_terminated_length": 562.0,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2641579508781433,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0,
"num_tokens": 3082345.0,
"reward": 0.12583430111408234,
"reward_std": 0.7026749849319458,
"rewards/cosine_scaled_reward/mean": -0.10114534199237823,
"rewards/cosine_scaled_reward/std": 0.3608616590499878,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 1970.84375,
"completions/mean_terminated_length": 1636.5,
"completions/min_length": 975.0,
"completions/min_terminated_length": 975.0,
"epoch": 0.029714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24037978053092957,
"learning_rate": 9.982876141412855e-07,
"loss": 0.0,
"num_tokens": 3219111.0,
"reward": 0.21426932513713837,
"reward_std": 0.740675687789917,
"rewards/cosine_scaled_reward/mean": -0.06474034488201141,
"rewards/cosine_scaled_reward/std": 0.3838227093219757,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 1945.515625,
"completions/mean_terminated_length": 1319.2222900390625,
"completions/min_length": 575.0,
"completions/min_terminated_length": 575.0,
"epoch": 0.030857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24205996096134186,
"learning_rate": 9.975348529157229e-07,
"loss": -0.0,
"num_tokens": 3353912.0,
"reward": -0.21838442981243134,
"reward_std": 0.619316577911377,
"rewards/cosine_scaled_reward/mean": -0.19512970745563507,
"rewards/cosine_scaled_reward/std": 0.2882457375526428,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1776.484375,
"completions/mean_terminated_length": 1133.4210205078125,
"completions/min_length": 519.0,
"completions/min_terminated_length": 519.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.261581152677536,
"learning_rate": 9.96645768238595e-07,
"loss": 0.0,
"num_tokens": 3477943.0,
"reward": 0.2565116286277771,
"reward_std": 0.8822247385978699,
"rewards/cosine_scaled_reward/mean": -0.06705668568611145,
"rewards/cosine_scaled_reward/std": 0.47824493050575256,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1232.0,
"completions/mean_length": 1901.6875,
"completions/mean_terminated_length": 877.5,
"completions/min_length": 621.0,
"completions/min_terminated_length": 621.0,
"epoch": 0.03314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2625565528869629,
"learning_rate": 9.956206309337066e-07,
"loss": -0.0,
"num_tokens": 3610123.0,
"reward": -0.3446740508079529,
"reward_std": 0.3587799668312073,
"rewards/cosine_scaled_reward/mean": -0.24264952540397644,
"rewards/cosine_scaled_reward/std": 0.16127170622348785,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 1845.828125,
"completions/mean_terminated_length": 1123.7857666015625,
"completions/min_length": 789.0,
"completions/min_terminated_length": 789.0,
"epoch": 0.03428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2583629786968231,
"learning_rate": 9.944597532678119e-07,
"loss": 0.0,
"num_tokens": 3738792.0,
"reward": -0.13950452208518982,
"reward_std": 0.5518099069595337,
"rewards/cosine_scaled_reward/mean": -0.1869397610425949,
"rewards/cosine_scaled_reward/std": 0.2614031732082367,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1753.0,
"completions/mean_length": 1823.40625,
"completions/mean_terminated_length": 942.3077392578125,
"completions/min_length": 232.0,
"completions/min_terminated_length": 232.0,
"epoch": 0.03542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25341325998306274,
"learning_rate": 9.931634888554935e-07,
"loss": -0.0,
"num_tokens": 3865986.0,
"reward": -0.20477020740509033,
"reward_std": 0.6390085220336914,
"rewards/cosine_scaled_reward/mean": -0.20394760370254517,
"rewards/cosine_scaled_reward/std": 0.3794066309928894,
"rewards/format_reward/mean": 0.203125,
"rewards/format_reward/std": 0.40550529956817627,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 1889.484375,
"completions/mean_terminated_length": 1323.357177734375,
"completions/min_length": 714.0,
"completions/min_terminated_length": 714.0,
"epoch": 0.036571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23001989722251892,
"learning_rate": 9.917322325514487e-07,
"loss": -0.0,
"num_tokens": 3997265.0,
"reward": 0.01399039477109909,
"reward_std": 0.47122400999069214,
"rewards/cosine_scaled_reward/mean": -0.11800480633974075,
"rewards/cosine_scaled_reward/std": 0.4542357921600342,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1654.0,
"completions/mean_length": 1917.125,
"completions/mean_terminated_length": 1210.4000244140625,
"completions/min_length": 914.0,
"completions/min_terminated_length": 914.0,
"epoch": 0.037714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21402673423290253,
"learning_rate": 9.901664203302124e-07,
"loss": -0.0,
"num_tokens": 4131177.0,
"reward": -0.43841344118118286,
"reward_std": 0.3294987678527832,
"rewards/cosine_scaled_reward/mean": -0.29733169078826904,
"rewards/cosine_scaled_reward/std": 0.19245299696922302,
"rewards/format_reward/mean": 0.15625,
"rewards/format_reward/std": 0.36596253514289856,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 1486.53125,
"completions/mean_terminated_length": 925.0625,
"completions/min_length": 551.0,
"completions/min_terminated_length": 551.0,
"epoch": 0.038857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2830573320388794,
"learning_rate": 9.88466529153356e-07,
"loss": -0.0,
"num_tokens": 4235867.0,
"reward": 0.4512444734573364,
"reward_std": 0.8406625986099243,
"rewards/cosine_scaled_reward/mean": -0.04000277444720268,
"rewards/cosine_scaled_reward/std": 0.49787506461143494,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1168.0,
"completions/mean_length": 1839.96875,
"completions/mean_terminated_length": 568.6666870117188,
"completions/min_length": 357.0,
"completions/min_terminated_length": 357.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2852117717266083,
"learning_rate": 9.866330768241983e-07,
"loss": -0.0,
"num_tokens": 4365121.0,
"reward": -0.1632520854473114,
"reward_std": 0.6035048961639404,
"rewards/cosine_scaled_reward/mean": -0.1675635278224945,
"rewards/cosine_scaled_reward/std": 0.38546639680862427,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 1921.5625,
"completions/mean_terminated_length": 1148.888916015625,
"completions/min_length": 699.0,
"completions/min_terminated_length": 699.0,
"epoch": 0.04114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27304011583328247,
"learning_rate": 9.846666218300807e-07,
"loss": 0.0,
"num_tokens": 4499213.0,
"reward": -0.2672756016254425,
"reward_std": 0.45214492082595825,
"rewards/cosine_scaled_reward/mean": -0.20395030081272125,
"rewards/cosine_scaled_reward/std": 0.24503158032894135,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1805.0,
"completions/mean_length": 1901.515625,
"completions/mean_terminated_length": 1006.3333129882812,
"completions/min_length": 589.0,
"completions/min_terminated_length": 589.0,
"epoch": 0.04228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26615455746650696,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0,
"num_tokens": 4631934.0,
"reward": -0.4167596101760864,
"reward_std": 0.4093248248100281,
"rewards/cosine_scaled_reward/mean": -0.2786923050880432,
"rewards/cosine_scaled_reward/std": 0.16612833738327026,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1223.0,
"completions/mean_length": 1867.5625,
"completions/mean_terminated_length": 998.1818237304688,
"completions/min_length": 677.0,
"completions/min_terminated_length": 677.0,
"epoch": 0.04342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2513941526412964,
"learning_rate": 9.80337140183366e-07,
"loss": 0.0,
"num_tokens": 4763170.0,
"reward": -0.10445012152194977,
"reward_std": 0.42142462730407715,
"rewards/cosine_scaled_reward/mean": -0.1381625533103943,
"rewards/cosine_scaled_reward/std": 0.32096728682518005,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1940.0,
"completions/mean_length": 1663.828125,
"completions/mean_terminated_length": 818.6500244140625,
"completions/min_length": 312.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.044571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2877403199672699,
"learning_rate": 9.779754323328192e-07,
"loss": -0.0,
"num_tokens": 4880439.0,
"reward": 0.3092009425163269,
"reward_std": 0.5055705308914185,
"rewards/cosine_scaled_reward/mean": -0.040712013840675354,
"rewards/cosine_scaled_reward/std": 0.3459153175354004,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 1534.296875,
"completions/mean_terminated_length": 987.4515991210938,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.045714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26293206214904785,
"learning_rate": 9.754833590196926e-07,
"loss": 0.0,
"num_tokens": 4988706.0,
"reward": 0.37637200951576233,
"reward_std": 0.5428045392036438,
"rewards/cosine_scaled_reward/mean": -0.06962649524211884,
"rewards/cosine_scaled_reward/std": 0.44194599986076355,
"rewards/format_reward/mean": 0.515625,
"rewards/format_reward/std": 0.5037065148353577,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 1786.28125,
"completions/mean_terminated_length": 1250.3809814453125,
"completions/min_length": 738.0,
"completions/min_terminated_length": 738.0,
"epoch": 0.046857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2139737904071808,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0,
"num_tokens": 5114180.0,
"reward": 0.01875646412372589,
"reward_std": 0.6959635019302368,
"rewards/cosine_scaled_reward/mean": -0.17812177538871765,
"rewards/cosine_scaled_reward/std": 0.34367337822914124,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1828.0,
"completions/mean_length": 1684.140625,
"completions/mean_terminated_length": 592.5625,
"completions/min_length": 182.0,
"completions/min_terminated_length": 182.0,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39831504225730896,
"learning_rate": 9.701111919237408e-07,
"loss": 0.0,
"num_tokens": 5232325.0,
"reward": -0.21632033586502075,
"reward_std": 0.3267907500267029,
"rewards/cosine_scaled_reward/mean": -0.24097268283367157,
"rewards/cosine_scaled_reward/std": 0.17323769629001617,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1994.0,
"completions/mean_length": 1685.078125,
"completions/mean_terminated_length": 992.227294921875,
"completions/min_length": 534.0,
"completions/min_terminated_length": 534.0,
"epoch": 0.04914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25143784284591675,
"learning_rate": 9.672327345550543e-07,
"loss": -0.0,
"num_tokens": 5351786.0,
"reward": 0.16303199529647827,
"reward_std": 0.48110607266426086,
"rewards/cosine_scaled_reward/mean": -0.09035900980234146,
"rewards/cosine_scaled_reward/std": 0.3455837368965149,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1377.0,
"completions/mean_length": 1539.421875,
"completions/mean_terminated_length": 796.1154174804688,
"completions/min_length": 289.0,
"completions/min_terminated_length": 289.0,
"epoch": 0.05028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3059767186641693,
"learning_rate": 9.64227184053598e-07,
"loss": -0.0,
"num_tokens": 5461005.0,
"reward": 0.5107974410057068,
"reward_std": 0.6938745379447937,
"rewards/cosine_scaled_reward/mean": 0.04446123540401459,
"rewards/cosine_scaled_reward/std": 0.5113232135772705,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1688.0,
"completions/mean_length": 1997.90625,
"completions/mean_terminated_length": 1513.666748046875,
"completions/min_length": 1198.0,
"completions/min_terminated_length": 1198.0,
"epoch": 0.05142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21965721249580383,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0,
"num_tokens": 5600527.0,
"reward": -0.0863756537437439,
"reward_std": 0.5902912020683289,
"rewards/cosine_scaled_reward/mean": -0.12131282687187195,
"rewards/cosine_scaled_reward/std": 0.3591388165950775,
"rewards/format_reward/mean": 0.15625,
"rewards/format_reward/std": 0.36596253514289856,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1599.0,
"completions/mean_length": 1704.421875,
"completions/mean_terminated_length": 948.5499877929688,
"completions/min_length": 535.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.052571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30566513538360596,
"learning_rate": 9.578385041664925e-07,
"loss": 0.0,
"num_tokens": 5720778.0,
"reward": -0.10181278735399246,
"reward_std": 0.5302228927612305,
"rewards/cosine_scaled_reward/mean": -0.21496888995170593,
"rewards/cosine_scaled_reward/std": 0.2217058539390564,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1668.171875,
"completions/mean_terminated_length": 991.0869750976562,
"completions/min_length": 368.0,
"completions/min_terminated_length": 368.0,
"epoch": 0.053714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3012264668941498,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0,
"num_tokens": 5837941.0,
"reward": 0.1859496831893921,
"reward_std": 0.9643809795379639,
"rewards/cosine_scaled_reward/mean": -0.11796265840530396,
"rewards/cosine_scaled_reward/std": 0.45073381066322327,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1945.0,
"completions/mean_length": 1640.0,
"completions/mean_terminated_length": 1003.5199584960938,
"completions/min_length": 441.0,
"completions/min_terminated_length": 441.0,
"epoch": 0.054857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23681208491325378,
"learning_rate": 9.509529358847654e-07,
"loss": 0.0,
"num_tokens": 5953445.0,
"reward": 0.13163542747497559,
"reward_std": 0.6663622856140137,
"rewards/cosine_scaled_reward/mean": -0.1294947862625122,
"rewards/cosine_scaled_reward/std": 0.3682635426521301,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1542.0,
"completions/mean_length": 1461.109375,
"completions/mean_terminated_length": 795.9667358398438,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3055432438850403,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0,
"num_tokens": 6057020.0,
"reward": 0.18277686834335327,
"reward_std": 0.6457837820053101,
"rewards/cosine_scaled_reward/mean": -0.14298656582832336,
"rewards/cosine_scaled_reward/std": 0.33668506145477295,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 1671.09375,
"completions/mean_terminated_length": 951.5454711914062,
"completions/min_length": 342.0,
"completions/min_terminated_length": 342.0,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2442624568939209,
"learning_rate": 9.43578868212728e-07,
"loss": 0.0,
"num_tokens": 6174786.0,
"reward": 0.2543642520904541,
"reward_std": 0.6998432874679565,
"rewards/cosine_scaled_reward/mean": -0.08375539630651474,
"rewards/cosine_scaled_reward/std": 0.4246826469898224,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1512.0,
"completions/mean_length": 1313.65625,
"completions/mean_terminated_length": 777.7838134765625,
"completions/min_length": 360.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.05828571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3034026622772217,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0,
"num_tokens": 6269068.0,
"reward": 0.17114153504371643,
"reward_std": 0.5826554298400879,
"rewards/cosine_scaled_reward/mean": -0.21911674737930298,
"rewards/cosine_scaled_reward/std": 0.2822759747505188,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1453.0,
"completions/mean_length": 1678.28125,
"completions/mean_terminated_length": 733.4444580078125,
"completions/min_length": 235.0,
"completions/min_terminated_length": 235.0,
"epoch": 0.05942857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2994474470615387,
"learning_rate": 9.357252853159505e-07,
"loss": -0.0,
"num_tokens": 6387830.0,
"reward": -0.17168548703193665,
"reward_std": 0.49792978167533875,
"rewards/cosine_scaled_reward/mean": -0.22646775841712952,
"rewards/cosine_scaled_reward/std": 0.34715980291366577,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1524.0,
"completions/mean_length": 1553.984375,
"completions/mean_terminated_length": 831.9615478515625,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.060571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25160443782806396,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0,
"num_tokens": 6498117.0,
"reward": 0.1639135181903839,
"reward_std": 0.668002724647522,
"rewards/cosine_scaled_reward/mean": -0.14460574090480804,
"rewards/cosine_scaled_reward/std": 0.32541966438293457,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.501733124256134,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1585.90625,
"completions/mean_terminated_length": 1062.2000732421875,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.061714285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2538788914680481,
"learning_rate": 9.274017555754407e-07,
"loss": -0.0,
"num_tokens": 6610759.0,
"reward": 0.7184321880340576,
"reward_std": 1.0729029178619385,
"rewards/cosine_scaled_reward/mean": 0.07015358656644821,
"rewards/cosine_scaled_reward/std": 0.5069921016693115,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.49776285886764526,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1815.0,
"completions/mean_length": 1486.75,
"completions/mean_terminated_length": 850.6666870117188,
"completions/min_length": 365.0,
"completions/min_terminated_length": 365.0,
"epoch": 0.06285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28393346071243286,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0,
"num_tokens": 6716199.0,
"reward": 0.38126039505004883,
"reward_std": 0.6288601160049438,
"rewards/cosine_scaled_reward/mean": -0.08280730992555618,
"rewards/cosine_scaled_reward/std": 0.434533029794693,
"rewards/format_reward/mean": 0.546875,
"rewards/format_reward/std": 0.501733124256134,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 1668.0,
"completions/mean_terminated_length": 1147.25927734375,
"completions/min_length": 570.0,
"completions/min_terminated_length": 570.0,
"epoch": 0.064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24122385680675507,
"learning_rate": 9.186184199300463e-07,
"loss": -0.0,
"num_tokens": 6833911.0,
"reward": 0.24217453598976135,
"reward_std": 0.576280951499939,
"rewards/cosine_scaled_reward/mean": -0.11328773200511932,
"rewards/cosine_scaled_reward/std": 0.43596696853637695,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2008.0,
"completions/mean_length": 1713.359375,
"completions/mean_terminated_length": 1191.3199462890625,
"completions/min_length": 677.0,
"completions/min_terminated_length": 677.0,
"epoch": 0.06514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2275950163602829,
"learning_rate": 9.140576474687263e-07,
"loss": -0.0,
"num_tokens": 6955134.0,
"reward": -0.09758470952510834,
"reward_std": 0.5638470649719238,
"rewards/cosine_scaled_reward/mean": -0.29097986221313477,
"rewards/cosine_scaled_reward/std": 0.2019655853509903,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.5037065148353577,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1954.0,
"completions/mean_length": 1325.90625,
"completions/mean_terminated_length": 973.2557983398438,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.06628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.270525723695755,
"learning_rate": 9.093859795212817e-07,
"loss": -0.0,
"num_tokens": 7050088.0,
"reward": 0.5497192144393921,
"reward_std": 0.8806554675102234,
"rewards/cosine_scaled_reward/mean": -0.07670287787914276,
"rewards/cosine_scaled_reward/std": 0.48966917395591736,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1863.0,
"completions/mean_length": 1507.9375,
"completions/mean_terminated_length": 1138.4210205078125,
"completions/min_length": 678.0,
"completions/min_terminated_length": 678.0,
"epoch": 0.06742857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24991631507873535,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0,
"num_tokens": 7157060.0,
"reward": 0.47330179810523987,
"reward_std": 0.6620825529098511,
"rewards/cosine_scaled_reward/mean": -0.07584910094738007,
"rewards/cosine_scaled_reward/std": 0.39760199189186096,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1620.84375,
"completions/mean_terminated_length": 1071.6429443359375,
"completions/min_length": 504.0,
"completions/min_terminated_length": 504.0,
"epoch": 0.06857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2627163827419281,
"learning_rate": 8.997156826556369e-07,
"loss": 0.0,
"num_tokens": 7271682.0,
"reward": 0.01943434774875641,
"reward_std": 0.7573007345199585,
"rewards/cosine_scaled_reward/mean": -0.2246578335762024,
"rewards/cosine_scaled_reward/std": 0.3148350715637207,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2027.0,
"completions/mean_length": 1701.5625,
"completions/mean_terminated_length": 1040.181884765625,
"completions/min_length": 452.0,
"completions/min_terminated_length": 452.0,
"epoch": 0.06971428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21747122704982758,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0,
"num_tokens": 7392102.0,
"reward": 0.26178231835365295,
"reward_std": 0.6629467010498047,
"rewards/cosine_scaled_reward/mean": -0.10348384082317352,
"rewards/cosine_scaled_reward/std": 0.31626051664352417,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1704.0,
"completions/mean_length": 1174.796875,
"completions/mean_terminated_length": 907.4898071289062,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.07085714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2614337205886841,
"learning_rate": 8.896193111002475e-07,
"loss": 0.0,
"num_tokens": 7477521.0,
"reward": 1.0250537395477295,
"reward_std": 0.7894514799118042,
"rewards/cosine_scaled_reward/mean": 0.11408931016921997,
"rewards/cosine_scaled_reward/std": 0.5407090783119202,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1249.328125,
"completions/mean_terminated_length": 859.279052734375,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.072,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29820746183395386,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0,
"num_tokens": 7567734.0,
"reward": 0.7643536329269409,
"reward_std": 0.7760990858078003,
"rewards/cosine_scaled_reward/mean": 0.007176805287599564,
"rewards/cosine_scaled_reward/std": 0.4894968271255493,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.0,
"completions/mean_length": 1257.546875,
"completions/mean_terminated_length": 1036.219970703125,
"completions/min_length": 544.0,
"completions/min_terminated_length": 544.0,
"epoch": 0.07314285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27683475613594055,
"learning_rate": 8.791091657286267e-07,
"loss": -0.0,
"num_tokens": 7659169.0,
"reward": 0.664791464805603,
"reward_std": 0.7692580223083496,
"rewards/cosine_scaled_reward/mean": -0.08947926759719849,
"rewards/cosine_scaled_reward/std": 0.4077052175998688,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1779.0,
"completions/mean_length": 1356.265625,
"completions/mean_terminated_length": 851.4865112304688,
"completions/min_length": 285.0,
"completions/min_terminated_length": 285.0,
"epoch": 0.07428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31939539313316345,
"learning_rate": 8.737029101523929e-07,
"loss": -0.0,
"num_tokens": 7756674.0,
"reward": 0.253554105758667,
"reward_std": 0.5708951950073242,
"rewards/cosine_scaled_reward/mean": -0.1700979471206665,
"rewards/cosine_scaled_reward/std": 0.4101679027080536,
"rewards/format_reward/mean": 0.59375,
"rewards/format_reward/std": 0.49501484632492065,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1867.0,
"completions/mean_length": 924.375,
"completions/mean_terminated_length": 763.857177734375,
"completions/min_length": 193.0,
"completions/min_terminated_length": 193.0,
"epoch": 0.07542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3720225393772125,
"learning_rate": 8.681980515339463e-07,
"loss": -0.0,
"num_tokens": 7826066.0,
"reward": 1.2181510925292969,
"reward_std": 0.8191297650337219,
"rewards/cosine_scaled_reward/mean": 0.15595056116580963,
"rewards/cosine_scaled_reward/std": 0.5347589254379272,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 1718.703125,
"completions/mean_terminated_length": 1090.0455322265625,
"completions/min_length": 418.0,
"completions/min_terminated_length": 418.0,
"epoch": 0.07657142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2455451935529709,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0,
"num_tokens": 7946799.0,
"reward": -0.13946212828159332,
"reward_std": 0.4192533791065216,
"rewards/cosine_scaled_reward/mean": -0.26504355669021606,
"rewards/cosine_scaled_reward/std": 0.1596679985523224,
"rewards/format_reward/mean": 0.390625,
"rewards/format_reward/std": 0.4917473793029785,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1835.0,
"completions/mean_length": 796.03125,
"completions/mean_terminated_length": 642.2807006835938,
"completions/min_length": 263.0,
"completions/min_terminated_length": 263.0,
"epoch": 0.07771428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3650038540363312,
"learning_rate": 8.568992620281243e-07,
"loss": 0.0,
"num_tokens": 8007001.0,
"reward": 0.8269755840301514,
"reward_std": 0.7205700874328613,
"rewards/cosine_scaled_reward/mean": -0.03182470053434372,
"rewards/cosine_scaled_reward/std": 0.44225865602493286,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 1052.078125,
"completions/mean_terminated_length": 867.6481323242188,
"completions/min_length": 129.0,
"completions/min_terminated_length": 129.0,
"epoch": 0.07885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.5175226926803589,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0,
"num_tokens": 8084678.0,
"reward": 0.5431624054908752,
"reward_std": 0.5567936897277832,
"rewards/cosine_scaled_reward/mean": -0.15810629725456238,
"rewards/cosine_scaled_reward/std": 0.31712469458580017,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1940.0,
"completions/mean_length": 1458.953125,
"completions/mean_terminated_length": 1055.9210205078125,
"completions/min_length": 447.0,
"completions/min_terminated_length": 447.0,
"epoch": 0.08,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2705329358577728,
"learning_rate": 8.452265630457282e-07,
"loss": -0.0,
"num_tokens": 8189507.0,
"reward": 0.2633436322212219,
"reward_std": 0.7909030914306641,
"rewards/cosine_scaled_reward/mean": -0.18864068388938904,
"rewards/cosine_scaled_reward/std": 0.37829747796058655,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 1449.71875,
"completions/mean_terminated_length": 921.8235473632812,
"completions/min_length": 377.0,
"completions/min_terminated_length": 377.0,
"epoch": 0.08114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30838489532470703,
"learning_rate": 8.392544243589427e-07,
"loss": -0.0,
"num_tokens": 8293009.0,
"reward": 0.3965587615966797,
"reward_std": 0.6687955856323242,
"rewards/cosine_scaled_reward/mean": -0.06734561175107956,
"rewards/cosine_scaled_reward/std": 0.4826039671897888,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1638.0,
"completions/mean_length": 1165.0625,
"completions/mean_terminated_length": 917.8399658203125,
"completions/min_length": 330.0,
"completions/min_terminated_length": 330.0,
"epoch": 0.08228571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37962082028388977,
"learning_rate": 8.331941759724268e-07,
"loss": -0.0,
"num_tokens": 8377925.0,
"reward": 0.48002344369888306,
"reward_std": 0.7248474359512329,
"rewards/cosine_scaled_reward/mean": -0.15842577815055847,
"rewards/cosine_scaled_reward/std": 0.3461473882198334,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1892.0,
"completions/mean_length": 1507.953125,
"completions/mean_terminated_length": 1205.0,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.08342857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.283483624458313,
"learning_rate": 8.270476638965461e-07,
"loss": -0.0,
"num_tokens": 8485162.0,
"reward": 0.12193681299686432,
"reward_std": 0.43324506282806396,
"rewards/cosine_scaled_reward/mean": -0.28278160095214844,
"rewards/cosine_scaled_reward/std": 0.2103184014558792,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1872.0,
"completions/mean_length": 1235.390625,
"completions/mean_terminated_length": 964.5208740234375,
"completions/min_length": 522.0,
"completions/min_terminated_length": 522.0,
"epoch": 0.08457142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2533995509147644,
"learning_rate": 8.208167604184217e-07,
"loss": 0.0,
"num_tokens": 8574155.0,
"reward": 0.711876630783081,
"reward_std": 0.598979651927948,
"rewards/cosine_scaled_reward/mean": -0.019061744213104248,
"rewards/cosine_scaled_reward/std": 0.5070863962173462,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1396.0,
"completions/mean_terminated_length": 1178.666748046875,
"completions/min_length": 526.0,
"completions/min_terminated_length": 526.0,
"epoch": 0.08571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21683472394943237,
"learning_rate": 8.145033635316128e-07,
"loss": -0.0,
"num_tokens": 8674459.0,
"reward": 0.8670482635498047,
"reward_std": 0.5205744504928589,
"rewards/cosine_scaled_reward/mean": 0.011649124324321747,
"rewards/cosine_scaled_reward/std": 0.46857622265815735,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 1208.875,
"completions/mean_terminated_length": 827.45458984375,
"completions/min_length": 295.0,
"completions/min_terminated_length": 295.0,
"epoch": 0.08685714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29886162281036377,
"learning_rate": 8.081093963579707e-07,
"loss": 0.0,
"num_tokens": 8762227.0,
"reward": 0.33387672901153564,
"reward_std": 0.5217430591583252,
"rewards/cosine_scaled_reward/mean": -0.17681162059307098,
"rewards/cosine_scaled_reward/std": 0.33101972937583923,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 1202.171875,
"completions/mean_terminated_length": 1026.6226806640625,
"completions/min_length": 617.0,
"completions/min_terminated_length": 617.0,
"epoch": 0.088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2510688602924347,
"learning_rate": 8.01636806561836e-07,
"loss": -0.0,
"num_tokens": 8850742.0,
"reward": 0.4464802145957947,
"reward_std": 0.452653169631958,
"rewards/cosine_scaled_reward/mean": -0.19863487780094147,
"rewards/cosine_scaled_reward/std": 0.29697054624557495,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1510.171875,
"completions/mean_terminated_length": 1035.61767578125,
"completions/min_length": 500.0,
"completions/min_terminated_length": 500.0,
"epoch": 0.08914285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22833088040351868,
"learning_rate": 7.950875657567621e-07,
"loss": 0.0,
"num_tokens": 8958113.0,
"reward": 0.47979119420051575,
"reward_std": 0.697140097618103,
"rewards/cosine_scaled_reward/mean": -0.06479191035032272,
"rewards/cosine_scaled_reward/std": 0.45924264192581177,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1590.0,
"completions/mean_length": 1005.609375,
"completions/mean_terminated_length": 812.5740966796875,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.09028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31812578439712524,
"learning_rate": 7.884636689049422e-07,
"loss": -0.0,
"num_tokens": 9032568.0,
"reward": 0.5333245992660522,
"reward_std": 0.58503657579422,
"rewards/cosine_scaled_reward/mean": -0.15521270036697388,
"rewards/cosine_scaled_reward/std": 0.39932510256767273,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2018.0,
"completions/mean_length": 1430.390625,
"completions/mean_terminated_length": 1106.8809814453125,
"completions/min_length": 408.0,
"completions/min_terminated_length": 408.0,
"epoch": 0.09142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.317196786403656,
"learning_rate": 7.817671337095244e-07,
"loss": 0.0,
"num_tokens": 9134505.0,
"reward": 0.3816445469856262,
"reward_std": 0.7023000717163086,
"rewards/cosine_scaled_reward/mean": -0.16074024140834808,
"rewards/cosine_scaled_reward/std": 0.340556800365448,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 1082.9375,
"completions/mean_terminated_length": 983.1034545898438,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"epoch": 0.09257142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3431136906147003,
"learning_rate": 7.75e-07,
"loss": -0.0,
"num_tokens": 9215085.0,
"reward": 0.5323719382286072,
"reward_std": 0.5667048692703247,
"rewards/cosine_scaled_reward/mean": -0.1869390308856964,
"rewards/cosine_scaled_reward/std": 0.3326209485530853,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1958.0,
"completions/mean_length": 1150.921875,
"completions/mean_terminated_length": 984.7963256835938,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.09371428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29262369871139526,
"learning_rate": 7.681643291108517e-07,
"loss": -0.0,
"num_tokens": 9299072.0,
"reward": 0.882739782333374,
"reward_std": 0.48830458521842957,
"rewards/cosine_scaled_reward/mean": 0.0038698911666870117,
"rewards/cosine_scaled_reward/std": 0.5622411966323853,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.265625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1688.0,
"completions/mean_length": 1255.515625,
"completions/mean_terminated_length": 968.872314453125,
"completions/min_length": 314.0,
"completions/min_terminated_length": 314.0,
"epoch": 0.09485714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3286426365375519,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0,
"num_tokens": 9390513.0,
"reward": 0.5811824202537537,
"reward_std": 0.43479597568511963,
"rewards/cosine_scaled_reward/mean": -0.07659629732370377,
"rewards/cosine_scaled_reward/std": 0.3988858759403229,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1944.0,
"completions/mean_length": 1143.40625,
"completions/mean_terminated_length": 955.660400390625,
"completions/min_length": 321.0,
"completions/min_terminated_length": 321.0,
"epoch": 0.096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32434922456741333,
"learning_rate": 7.54295724882796e-07,
"loss": 0.0,
"num_tokens": 9474387.0,
"reward": 0.599439799785614,
"reward_std": 0.6882362961769104,
"rewards/cosine_scaled_reward/mean": -0.137780100107193,
"rewards/cosine_scaled_reward/std": 0.39472848176956177,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1309.71875,
"completions/mean_terminated_length": 1139.34619140625,
"completions/min_length": 707.0,
"completions/min_terminated_length": 707.0,
"epoch": 0.09714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1989050805568695,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0,
"num_tokens": 9568529.0,
"reward": 0.6224732398986816,
"reward_std": 0.6126816868782043,
"rewards/cosine_scaled_reward/mean": -0.12626340985298157,
"rewards/cosine_scaled_reward/std": 0.3711291551589966,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1932.0,
"completions/mean_length": 1287.6875,
"completions/mean_terminated_length": 1034.25,
"completions/min_length": 419.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.09828571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2835865616798401,
"learning_rate": 7.401782177833147e-07,
"loss": 0.0,
"num_tokens": 9661797.0,
"reward": 0.6656298637390137,
"reward_std": 0.6712964773178101,
"rewards/cosine_scaled_reward/mean": -0.05781007558107376,
"rewards/cosine_scaled_reward/std": 0.3957676589488983,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 1048.0,
"completions/mean_terminated_length": 862.8148193359375,
"completions/min_length": 429.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.09942857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32077720761299133,
"learning_rate": 7.330314893841101e-07,
"loss": -0.0,
"num_tokens": 9738989.0,
"reward": 0.5565430521965027,
"reward_std": 0.48540347814559937,
"rewards/cosine_scaled_reward/mean": -0.15141595900058746,
"rewards/cosine_scaled_reward/std": 0.30698010325431824,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1026.359375,
"completions/mean_terminated_length": 939.7796630859375,
"completions/min_length": 142.0,
"completions/min_terminated_length": 142.0,
"epoch": 0.10057142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31619754433631897,
"learning_rate": 7.258290078201731e-07,
"loss": -0.0,
"num_tokens": 9815188.0,
"reward": 1.1399941444396973,
"reward_std": 0.672067403793335,
"rewards/cosine_scaled_reward/mean": 0.07780956476926804,
"rewards/cosine_scaled_reward/std": 0.45696425437927246,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1960.0,
"completions/mean_length": 1263.859375,
"completions/mean_terminated_length": 1135.54541015625,
"completions/min_length": 579.0,
"completions/min_terminated_length": 579.0,
"epoch": 0.10171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2358577847480774,
"learning_rate": 7.185729670371604e-07,
"loss": -0.0,
"num_tokens": 9907075.0,
"reward": 0.6882132291793823,
"reward_std": 0.7329428195953369,
"rewards/cosine_scaled_reward/mean": -0.10901839286088943,
"rewards/cosine_scaled_reward/std": 0.3872850835323334,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1843.0,
"completions/mean_length": 1060.53125,
"completions/mean_terminated_length": 898.9454345703125,
"completions/min_length": 127.0,
"completions/min_terminated_length": 127.0,
"epoch": 0.10285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.48015958070755005,
"learning_rate": 7.11265577295385e-07,
"loss": 0.0,
"num_tokens": 9984949.0,
"reward": 0.5475899577140808,
"reward_std": 0.6522415280342102,
"rewards/cosine_scaled_reward/mean": -0.1558925211429596,
"rewards/cosine_scaled_reward/std": 0.36059972643852234,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 1377.046875,
"completions/mean_terminated_length": 1093.755615234375,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"epoch": 0.104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2658543288707733,
"learning_rate": 7.039090644965509e-07,
"loss": -0.0,
"num_tokens": 10083648.0,
"reward": 0.5291076302528381,
"reward_std": 0.7617174386978149,
"rewards/cosine_scaled_reward/mean": -0.11044619232416153,
"rewards/cosine_scaled_reward/std": 0.42394205927848816,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1595.0,
"completions/mean_length": 1180.59375,
"completions/mean_terminated_length": 915.0612182617188,
"completions/min_length": 486.0,
"completions/min_terminated_length": 486.0,
"epoch": 0.10514285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2867736518383026,
"learning_rate": 6.965056695057204e-07,
"loss": 0.0,
"num_tokens": 10169198.0,
"reward": 0.5447607040405273,
"reward_std": 0.686552107334137,
"rewards/cosine_scaled_reward/mean": -0.11824464052915573,
"rewards/cosine_scaled_reward/std": 0.358975350856781,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1407.75,
"completions/mean_terminated_length": 1072.3809814453125,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.10628571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35998135805130005,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0,
"num_tokens": 10270638.0,
"reward": 0.34720176458358765,
"reward_std": 0.5368383526802063,
"rewards/cosine_scaled_reward/mean": -0.22483661770820618,
"rewards/cosine_scaled_reward/std": 0.23887285590171814,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1906.0,
"completions/mean_length": 1299.734375,
"completions/mean_terminated_length": 907.7857055664062,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.10742857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37465807795524597,
"learning_rate": 6.815672671252315e-07,
"loss": 0.0,
"num_tokens": 10363589.0,
"reward": 0.7590231895446777,
"reward_std": 0.7117502689361572,
"rewards/cosine_scaled_reward/mean": 0.035761602222919464,
"rewards/cosine_scaled_reward/std": 0.48915430903434753,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 1240.125,
"completions/mean_terminated_length": 1013.9199829101562,
"completions/min_length": 370.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.10857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29062458872795105,
"learning_rate": 6.740368101176495e-07,
"loss": -0.0,
"num_tokens": 10453485.0,
"reward": 0.2566743791103363,
"reward_std": 0.48482388257980347,
"rewards/cosine_scaled_reward/mean": -0.26228782534599304,
"rewards/cosine_scaled_reward/std": 0.20987816154956818,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1548.0,
"completions/mean_length": 1096.421875,
"completions/mean_terminated_length": 940.7090454101562,
"completions/min_length": 410.0,
"completions/min_terminated_length": 410.0,
"epoch": 0.10971428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3024481236934662,
"learning_rate": 6.664685702961344e-07,
"loss": 0.0,
"num_tokens": 10534792.0,
"reward": 0.84682697057724,
"reward_std": 0.4051811099052429,
"rewards/cosine_scaled_reward/mean": -0.014086522161960602,
"rewards/cosine_scaled_reward/std": 0.4226605296134949,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1858.0,
"completions/mean_length": 1193.0,
"completions/mean_terminated_length": 1034.6666259765625,
"completions/min_length": 380.0,
"completions/min_terminated_length": 380.0,
"epoch": 0.11085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28268054127693176,
"learning_rate": 6.588648530198504e-07,
"loss": -0.0,
"num_tokens": 10621408.0,
"reward": 0.6044580340385437,
"reward_std": 0.6774412393569946,
"rewards/cosine_scaled_reward/mean": -0.14308346807956696,
"rewards/cosine_scaled_reward/std": 0.3645778298377991,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1966.0,
"completions/mean_length": 1146.03125,
"completions/mean_terminated_length": 958.8302001953125,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.112,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2698092758655548,
"learning_rate": 6.512279744547392e-07,
"loss": -0.0,
"num_tokens": 10705818.0,
"reward": 0.8127368092536926,
"reward_std": 0.5926570892333984,
"rewards/cosine_scaled_reward/mean": -0.031131573021411896,
"rewards/cosine_scaled_reward/std": 0.4754258096218109,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 1057.703125,
"completions/mean_terminated_length": 1008.9999389648438,
"completions/min_length": 414.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.11314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3098371922969818,
"learning_rate": 6.435602608679916e-07,
"loss": -0.0,
"num_tokens": 10784679.0,
"reward": 0.9630225300788879,
"reward_std": 0.48498910665512085,
"rewards/cosine_scaled_reward/mean": 0.01276126503944397,
"rewards/cosine_scaled_reward/std": 0.48310962319374084,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1873.0,
"completions/mean_length": 1139.109375,
"completions/mean_terminated_length": 1062.084716796875,
"completions/min_length": 478.0,
"completions/min_terminated_length": 478.0,
"epoch": 0.11428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29804834723472595,
"learning_rate": 6.358640479194451e-07,
"loss": 0.0,
"num_tokens": 10867294.0,
"reward": 0.958106279373169,
"reward_std": 0.757602334022522,
"rewards/cosine_scaled_reward/mean": 0.010303134098649025,
"rewards/cosine_scaled_reward/std": 0.4540289342403412,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1917.0,
"completions/mean_length": 1068.21875,
"completions/mean_terminated_length": 1052.666748046875,
"completions/min_length": 539.0,
"completions/min_terminated_length": 539.0,
"epoch": 0.11542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30073827505111694,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0,
"num_tokens": 10947092.0,
"reward": 0.7335419058799744,
"reward_std": 0.48280423879623413,
"rewards/cosine_scaled_reward/mean": -0.12541653215885162,
"rewards/cosine_scaled_reward/std": 0.34734830260276794,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1032.203125,
"completions/mean_terminated_length": 907.4561767578125,
"completions/min_length": 304.0,
"completions/min_terminated_length": 304.0,
"epoch": 0.11657142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34232160449028015,
"learning_rate": 6.203955092681039e-07,
"loss": 0.0,
"num_tokens": 11023305.0,
"reward": 0.5640091300010681,
"reward_std": 0.6805330514907837,
"rewards/cosine_scaled_reward/mean": -0.16330792009830475,
"rewards/cosine_scaled_reward/std": 0.3974398970603943,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 904.328125,
"completions/mean_terminated_length": 886.1746826171875,
"completions/min_length": 163.0,
"completions/min_terminated_length": 163.0,
"epoch": 0.11771428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.43588608503341675,
"learning_rate": 6.126278954320294e-07,
"loss": -0.0,
"num_tokens": 11091534.0,
"reward": 0.7056660056114197,
"reward_std": 0.5587431788444519,
"rewards/cosine_scaled_reward/mean": -0.13935449719429016,
"rewards/cosine_scaled_reward/std": 0.32663995027542114,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1530.0,
"completions/mean_length": 1109.765625,
"completions/mean_terminated_length": 870.6078491210938,
"completions/min_length": 272.0,
"completions/min_terminated_length": 272.0,
"epoch": 0.11885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2997737228870392,
"learning_rate": 6.048412045323164e-07,
"loss": -0.0,
"num_tokens": 11173023.0,
"reward": 0.5046586990356445,
"reward_std": 0.5760527849197388,
"rewards/cosine_scaled_reward/mean": -0.14610813558101654,
"rewards/cosine_scaled_reward/std": 0.366825133562088,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 1184.390625,
"completions/mean_terminated_length": 920.0203857421875,
"completions/min_length": 440.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.12,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2738684117794037,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0,
"num_tokens": 11260112.0,
"reward": 0.934418797492981,
"reward_std": 0.6995881795883179,
"rewards/cosine_scaled_reward/mean": 0.05314689874649048,
"rewards/cosine_scaled_reward/std": 0.5037528872489929,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1980.0,
"completions/mean_length": 984.8125,
"completions/mean_terminated_length": 874.8275756835938,
"completions/min_length": 258.0,
"completions/min_terminated_length": 258.0,
"epoch": 0.12114285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30534854531288147,
"learning_rate": 5.892200842364462e-07,
"loss": 0.0,
"num_tokens": 11333972.0,
"reward": 1.1282094717025757,
"reward_std": 0.701350748538971,
"rewards/cosine_scaled_reward/mean": 0.10316723585128784,
"rewards/cosine_scaled_reward/std": 0.4879910945892334,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1693.0,
"completions/mean_length": 1115.203125,
"completions/mean_terminated_length": 942.4629516601562,
"completions/min_length": 353.0,
"completions/min_terminated_length": 353.0,
"epoch": 0.12228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3157402276992798,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0,
"num_tokens": 11416497.0,
"reward": 0.6792718768119812,
"reward_std": 0.6421718597412109,
"rewards/cosine_scaled_reward/mean": -0.08223908394575119,
"rewards/cosine_scaled_reward/std": 0.4252789616584778,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1895.0,
"completions/mean_length": 1252.359375,
"completions/mean_terminated_length": 1008.7958984375,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.12342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32090920209884644,
"learning_rate": 5.735511803093248e-07,
"loss": -0.0,
"num_tokens": 11507008.0,
"reward": 0.5321451425552368,
"reward_std": 0.7731401324272156,
"rewards/cosine_scaled_reward/mean": -0.124552421271801,
"rewards/cosine_scaled_reward/std": 0.39858752489089966,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 1091.1875,
"completions/mean_terminated_length": 973.6842041015625,
"completions/min_length": 552.0,
"completions/min_terminated_length": 552.0,
"epoch": 0.12457142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27098071575164795,
"learning_rate": 5.657047735161255e-07,
"loss": -0.0,
"num_tokens": 11588092.0,
"reward": 0.8329494595527649,
"reward_std": 0.48314613103866577,
"rewards/cosine_scaled_reward/mean": -0.028837747871875763,
"rewards/cosine_scaled_reward/std": 0.4350675046443939,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 1152.53125,
"completions/mean_terminated_length": 986.7037353515625,
"completions/min_length": 265.0,
"completions/min_terminated_length": 265.0,
"epoch": 0.12571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4122578799724579,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0,
"num_tokens": 11672630.0,
"reward": 0.5694496631622314,
"reward_std": 0.6106870174407959,
"rewards/cosine_scaled_reward/mean": -0.14496266841888428,
"rewards/cosine_scaled_reward/std": 0.3454693555831909,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1783.0,
"completions/mean_length": 971.0,
"completions/mean_terminated_length": 918.03271484375,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.12685714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2827076315879822,
"learning_rate": 5.5e-07,
"loss": -0.0,
"num_tokens": 11744894.0,
"reward": 0.7054448127746582,
"reward_std": 0.5191388130187988,
"rewards/cosine_scaled_reward/mean": -0.1238400787115097,
"rewards/cosine_scaled_reward/std": 0.2727503180503845,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1988.0,
"completions/mean_length": 1287.234375,
"completions/mean_terminated_length": 1111.673095703125,
"completions/min_length": 605.0,
"completions/min_terminated_length": 605.0,
"epoch": 0.128,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26578545570373535,
"learning_rate": 5.421464171032224e-07,
"loss": -0.0,
"num_tokens": 11838373.0,
"reward": 0.8936529159545898,
"reward_std": 0.677398681640625,
"rewards/cosine_scaled_reward/mean": 0.032763972878456116,
"rewards/cosine_scaled_reward/std": 0.47608643770217896,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1969.0,
"completions/mean_length": 907.5,
"completions/mean_terminated_length": 870.7096557617188,
"completions/min_length": 314.0,
"completions/min_terminated_length": 314.0,
"epoch": 0.12914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36232173442840576,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0,
"num_tokens": 11906749.0,
"reward": 0.9407690763473511,
"reward_std": 0.6962294578552246,
"rewards/cosine_scaled_reward/mean": -0.013990461826324463,
"rewards/cosine_scaled_reward/std": 0.4877306818962097,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 946.78125,
"completions/mean_terminated_length": 873.36669921875,
"completions/min_length": 250.0,
"completions/min_terminated_length": 250.0,
"epoch": 0.13028571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3120001256465912,
"learning_rate": 5.264488196906752e-07,
"loss": 0.0,
"num_tokens": 11977191.0,
"reward": 0.61952805519104,
"reward_std": 0.609375536441803,
"rewards/cosine_scaled_reward/mean": -0.16679848730564117,
"rewards/cosine_scaled_reward/std": 0.37943220138549805,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 1104.734375,
"completions/mean_terminated_length": 969.982177734375,
"completions/min_length": 370.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.13142857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33438733220100403,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0,
"num_tokens": 12059110.0,
"reward": 0.5532407760620117,
"reward_std": 0.576167643070221,
"rewards/cosine_scaled_reward/mean": -0.16087961196899414,
"rewards/cosine_scaled_reward/std": 0.40955987572669983,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 1260.59375,
"completions/mean_terminated_length": 1059.8824462890625,
"completions/min_length": 361.0,
"completions/min_terminated_length": 361.0,
"epoch": 0.13257142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27838730812072754,
"learning_rate": 5.107799157635538e-07,
"loss": -0.0,
"num_tokens": 12151172.0,
"reward": 0.7721755504608154,
"reward_std": 0.7768255472183228,
"rewards/cosine_scaled_reward/mean": -0.027974726632237434,
"rewards/cosine_scaled_reward/std": 0.42550837993621826,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 1062.296875,
"completions/mean_terminated_length": 978.7626953125,
"completions/min_length": 440.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.1337142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2676664888858795,
"learning_rate": 5.02962191529556e-07,
"loss": -0.0,
"num_tokens": 12230183.0,
"reward": 0.7667758464813232,
"reward_std": 0.6048427820205688,
"rewards/cosine_scaled_reward/mean": -0.10879956185817719,
"rewards/cosine_scaled_reward/std": 0.4106774926185608,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1925.0,
"completions/mean_length": 1205.96875,
"completions/mean_terminated_length": 1011.6538696289062,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.13485714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33676981925964355,
"learning_rate": 4.951587954676837e-07,
"loss": -0.0,
"num_tokens": 12317901.0,
"reward": 0.7116703987121582,
"reward_std": 0.7047961950302124,
"rewards/cosine_scaled_reward/mean": -0.05041477829217911,
"rewards/cosine_scaled_reward/std": 0.4400728642940521,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1943.0,
"completions/mean_length": 963.578125,
"completions/mean_terminated_length": 851.3965454101562,
"completions/min_length": 162.0,
"completions/min_terminated_length": 162.0,
"epoch": 0.136,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.6137372851371765,
"learning_rate": 4.873721045679706e-07,
"loss": 0.0,
"num_tokens": 12389978.0,
"reward": 0.884113073348999,
"reward_std": 0.64817214012146,
"rewards/cosine_scaled_reward/mean": -0.011068470776081085,
"rewards/cosine_scaled_reward/std": 0.43141883611679077,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 804.75,
"completions/mean_terminated_length": 764.6451416015625,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.13714285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.39063799381256104,
"learning_rate": 4.79604490731896e-07,
"loss": -0.0,
"num_tokens": 12451938.0,
"reward": 0.7966957092285156,
"reward_std": 0.5527613759040833,
"rewards/cosine_scaled_reward/mean": -0.093839630484581,
"rewards/cosine_scaled_reward/std": 0.43766382336616516,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1688.0,
"completions/mean_length": 711.921875,
"completions/mean_terminated_length": 690.71435546875,
"completions/min_length": 225.0,
"completions/min_terminated_length": 225.0,
"epoch": 0.1382857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.4063321352005005,
"learning_rate": 4.7185832004988133e-07,
"loss": 0.0,
"num_tokens": 12507197.0,
"reward": 1.1037222146987915,
"reward_std": 0.5403161644935608,
"rewards/cosine_scaled_reward/mean": 0.059673577547073364,
"rewards/cosine_scaled_reward/std": 0.4637821912765503,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1967.0,
"completions/mean_length": 1144.90625,
"completions/mean_terminated_length": 1015.8928833007812,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.13942857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3278358280658722,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0,
"num_tokens": 12592031.0,
"reward": 0.9771745800971985,
"reward_std": 0.6282449960708618,
"rewards/cosine_scaled_reward/mean": 0.00421229749917984,
"rewards/cosine_scaled_reward/std": 0.4215405285358429,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1899.0,
"completions/mean_length": 1109.59375,
"completions/mean_terminated_length": 1012.5172119140625,
"completions/min_length": 360.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.14057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2767045497894287,
"learning_rate": 4.5643973913200837e-07,
"loss": -0.0,
"num_tokens": 12673277.0,
"reward": 0.650445818901062,
"reward_std": 0.5948874950408936,
"rewards/cosine_scaled_reward/mean": -0.1435271054506302,
"rewards/cosine_scaled_reward/std": 0.39249518513679504,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 1095.515625,
"completions/mean_terminated_length": 978.5438842773438,
"completions/min_length": 381.0,
"completions/min_terminated_length": 381.0,
"epoch": 0.1417142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2732603847980499,
"learning_rate": 4.4877202554526084e-07,
"loss": 0.0,
"num_tokens": 12754342.0,
"reward": 0.8451436758041382,
"reward_std": 0.7594490051269531,
"rewards/cosine_scaled_reward/mean": -0.0383656844496727,
"rewards/cosine_scaled_reward/std": 0.45196905732154846,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1931.0,
"completions/mean_length": 1066.4375,
"completions/mean_terminated_length": 1034.774169921875,
"completions/min_length": 395.0,
"completions/min_terminated_length": 395.0,
"epoch": 0.14285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28073129057884216,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0,
"num_tokens": 12833522.0,
"reward": 1.1210604906082153,
"reward_std": 0.480854868888855,
"rewards/cosine_scaled_reward/mean": 0.06834279000759125,
"rewards/cosine_scaled_reward/std": 0.5010288953781128,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1960.0,
"completions/mean_length": 1028.75,
"completions/mean_terminated_length": 978.6229248046875,
"completions/min_length": 575.0,
"completions/min_terminated_length": 575.0,
"epoch": 0.144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2878725826740265,
"learning_rate": 4.3353142970386557e-07,
"loss": -0.0,
"num_tokens": 12910658.0,
"reward": 1.0619797706604004,
"reward_std": 0.7742013335227966,
"rewards/cosine_scaled_reward/mean": 0.04661493003368378,
"rewards/cosine_scaled_reward/std": 0.4794113337993622,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1780.0,
"completions/mean_length": 1057.734375,
"completions/mean_terminated_length": 991.7167358398438,
"completions/min_length": 528.0,
"completions/min_terminated_length": 528.0,
"epoch": 0.14514285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.314113050699234,
"learning_rate": 4.2596318988235037e-07,
"loss": -0.0,
"num_tokens": 12989449.0,
"reward": 0.5687937140464783,
"reward_std": 0.5123973488807678,
"rewards/cosine_scaled_reward/mean": -0.19997814297676086,
"rewards/cosine_scaled_reward/std": 0.29952365159988403,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1836.0,
"completions/mean_length": 970.15625,
"completions/mean_terminated_length": 917.1474609375,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.1462857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3059556186199188,
"learning_rate": 4.1843273287476854e-07,
"loss": -0.0,
"num_tokens": 13061891.0,
"reward": 0.986152708530426,
"reward_std": 0.6476150751113892,
"rewards/cosine_scaled_reward/mean": 0.0008888617157936096,
"rewards/cosine_scaled_reward/std": 0.47057685256004333,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1270.71875,
"completions/mean_terminated_length": 1175.26318359375,
"completions/min_length": 420.0,
"completions/min_terminated_length": 420.0,
"epoch": 0.14742857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26219478249549866,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0,
"num_tokens": 13153921.0,
"reward": 0.7368666529655457,
"reward_std": 0.6195722818374634,
"rewards/cosine_scaled_reward/mean": -0.08469165861606598,
"rewards/cosine_scaled_reward/std": 0.4251137375831604,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 1131.328125,
"completions/mean_terminated_length": 941.0755004882812,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.14857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31731143593788147,
"learning_rate": 4.034943304942796e-07,
"loss": -0.0,
"num_tokens": 13236830.0,
"reward": 0.6148363351821899,
"reward_std": 0.5969675183296204,
"rewards/cosine_scaled_reward/mean": -0.13008181750774384,
"rewards/cosine_scaled_reward/std": 0.34181615710258484,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 941.3125,
"completions/mean_terminated_length": 826.8275756835938,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"epoch": 0.14971428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3140406608581543,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.0,
"num_tokens": 13306810.0,
"reward": 1.0727087259292603,
"reward_std": 0.5550357699394226,
"rewards/cosine_scaled_reward/mean": 0.07541687786579132,
"rewards/cosine_scaled_reward/std": 0.4260079562664032,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1545.0,
"completions/mean_length": 1367.53125,
"completions/mean_terminated_length": 870.9730224609375,
"completions/min_length": 340.0,
"completions/min_terminated_length": 340.0,
"epoch": 0.15085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2837361693382263,
"learning_rate": 3.8873442270461485e-07,
"loss": -0.0,
"num_tokens": 13405396.0,
"reward": 0.5292781591415405,
"reward_std": 0.6481244564056396,
"rewards/cosine_scaled_reward/mean": -0.04786092787981033,
"rewards/cosine_scaled_reward/std": 0.4380926489830017,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1924.0,
"completions/mean_length": 1180.875,
"completions/mean_terminated_length": 959.8432006835938,
"completions/min_length": 252.0,
"completions/min_terminated_length": 252.0,
"epoch": 0.152,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36373820900917053,
"learning_rate": 3.8142703296283953e-07,
"loss": -0.0,
"num_tokens": 13492196.0,
"reward": 0.5728870630264282,
"reward_std": 0.6083178520202637,
"rewards/cosine_scaled_reward/mean": -0.1276189684867859,
"rewards/cosine_scaled_reward/std": 0.3563939332962036,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1936.0,
"completions/mean_length": 1008.140625,
"completions/mean_terminated_length": 956.9999389648438,
"completions/min_length": 452.0,
"completions/min_terminated_length": 452.0,
"epoch": 0.15314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33836495876312256,
"learning_rate": 3.7417099217982686e-07,
"loss": 0.0,
"num_tokens": 13567285.0,
"reward": 1.2238911390304565,
"reward_std": 0.4800982177257538,
"rewards/cosine_scaled_reward/mean": 0.11194555461406708,
"rewards/cosine_scaled_reward/std": 0.5184221267700195,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1599.0,
"completions/mean_length": 723.609375,
"completions/mean_terminated_length": 680.8870849609375,
"completions/min_length": 276.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.15428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3624517619609833,
"learning_rate": 3.6696851061588994e-07,
"loss": -0.0,
"num_tokens": 13624068.0,
"reward": 1.5145277976989746,
"reward_std": 0.5155797004699707,
"rewards/cosine_scaled_reward/mean": 0.2650764584541321,
"rewards/cosine_scaled_reward/std": 0.4171845614910126,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1915.0,
"completions/mean_length": 1110.328125,
"completions/mean_terminated_length": 1013.3275756835938,
"completions/min_length": 452.0,
"completions/min_terminated_length": 452.0,
"epoch": 0.15542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32373178005218506,
"learning_rate": 3.5982178221668533e-07,
"loss": -0.0,
"num_tokens": 13705801.0,
"reward": 1.1360313892364502,
"reward_std": 0.668129563331604,
"rewards/cosine_scaled_reward/mean": 0.06801574677228928,
"rewards/cosine_scaled_reward/std": 0.5162939429283142,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2020.0,
"completions/mean_length": 1127.09375,
"completions/mean_terminated_length": 1112.4761962890625,
"completions/min_length": 411.0,
"completions/min_terminated_length": 411.0,
"epoch": 0.15657142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30264437198638916,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.0,
"num_tokens": 13788543.0,
"reward": 0.6311769485473633,
"reward_std": 0.5873199701309204,
"rewards/cosine_scaled_reward/mean": -0.18441152572631836,
"rewards/cosine_scaled_reward/std": 0.3330920338630676,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1032.8125,
"completions/mean_terminated_length": 946.7796630859375,
"completions/min_length": 400.0,
"completions/min_terminated_length": 400.0,
"epoch": 0.15771428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2963666617870331,
"learning_rate": 3.45704275117204e-07,
"loss": -0.0,
"num_tokens": 13865955.0,
"reward": 0.5941890478134155,
"reward_std": 0.553059995174408,
"rewards/cosine_scaled_reward/mean": -0.19509297609329224,
"rewards/cosine_scaled_reward/std": 0.34852251410484314,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1111.96875,
"completions/mean_terminated_length": 1015.137939453125,
"completions/min_length": 178.0,
"completions/min_terminated_length": 178.0,
"epoch": 0.15885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3229401707649231,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0,
"num_tokens": 13947761.0,
"reward": 0.6673665046691895,
"reward_std": 0.5299196243286133,
"rewards/cosine_scaled_reward/mean": -0.16631674766540527,
"rewards/cosine_scaled_reward/std": 0.38788118958473206,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 980.09375,
"completions/mean_terminated_length": 945.6451416015625,
"completions/min_length": 270.0,
"completions/min_terminated_length": 270.0,
"epoch": 0.16,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.34156280755996704,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.0,
"num_tokens": 14020711.0,
"reward": 0.8693222999572754,
"reward_std": 0.5208969116210938,
"rewards/cosine_scaled_reward/mean": -0.0653388649225235,
"rewards/cosine_scaled_reward/std": 0.5035129189491272,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1492.0,
"completions/mean_length": 1007.890625,
"completions/mean_terminated_length": 919.7457885742188,
"completions/min_length": 446.0,
"completions/min_terminated_length": 446.0,
"epoch": 0.16114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3044126331806183,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0,
"num_tokens": 14095776.0,
"reward": 1.0021867752075195,
"reward_std": 0.6079363822937012,
"rewards/cosine_scaled_reward/mean": 0.0010933950543403625,
"rewards/cosine_scaled_reward/std": 0.4816957116127014,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 1149.90625,
"completions/mean_terminated_length": 1073.796630859375,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.16228571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30035167932510376,
"learning_rate": 3.182328662904756e-07,
"loss": 0.0,
"num_tokens": 14179874.0,
"reward": 0.6333685517311096,
"reward_std": 0.41481128334999084,
"rewards/cosine_scaled_reward/mean": -0.1755032241344452,
"rewards/cosine_scaled_reward/std": 0.2467116117477417,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1816.0,
"completions/mean_length": 906.671875,
"completions/mean_terminated_length": 888.5556030273438,
"completions/min_length": 422.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.16342857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.357653945684433,
"learning_rate": 3.115363310950578e-07,
"loss": -0.0,
"num_tokens": 14248717.0,
"reward": 0.6540926098823547,
"reward_std": 0.46782517433166504,
"rewards/cosine_scaled_reward/mean": -0.16514119505882263,
"rewards/cosine_scaled_reward/std": 0.28250446915626526,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1934.0,
"completions/mean_length": 1020.765625,
"completions/mean_terminated_length": 894.614013671875,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.16457142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35631585121154785,
"learning_rate": 3.0491243424323783e-07,
"loss": 0.0,
"num_tokens": 14325534.0,
"reward": 1.0688426494598389,
"reward_std": 0.7873537540435791,
"rewards/cosine_scaled_reward/mean": 0.08129630982875824,
"rewards/cosine_scaled_reward/std": 0.5166342258453369,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1719.0,
"completions/mean_length": 842.203125,
"completions/mean_terminated_length": 823.0635375976562,
"completions/min_length": 186.0,
"completions/min_terminated_length": 186.0,
"epoch": 0.1657142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.40442949533462524,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0,
"num_tokens": 14389379.0,
"reward": 0.8424907922744751,
"reward_std": 0.612415611743927,
"rewards/cosine_scaled_reward/mean": -0.07094208896160126,
"rewards/cosine_scaled_reward/std": 0.4410366714000702,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 1026.828125,
"completions/mean_terminated_length": 976.6065063476562,
"completions/min_length": 440.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.16685714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28185784816741943,
"learning_rate": 2.918906036420294e-07,
"loss": 0.0,
"num_tokens": 14465712.0,
"reward": 0.560012698173523,
"reward_std": 0.4264100193977356,
"rewards/cosine_scaled_reward/mean": -0.21999366581439972,
"rewards/cosine_scaled_reward/std": 0.2619490623474121,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1906.0,
"completions/mean_length": 1220.9375,
"completions/mean_terminated_length": 1119.368408203125,
"completions/min_length": 621.0,
"completions/min_terminated_length": 621.0,
"epoch": 0.168,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2767592966556549,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0,
"num_tokens": 14554636.0,
"reward": 0.8378211259841919,
"reward_std": 0.6600607633590698,
"rewards/cosine_scaled_reward/mean": -0.07327694445848465,
"rewards/cosine_scaled_reward/std": 0.4367770254611969,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1039.28125,
"completions/mean_terminated_length": 915.4035034179688,
"completions/min_length": 317.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.16914285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31175175309181213,
"learning_rate": 2.791832395815782e-07,
"loss": 0.0,
"num_tokens": 14632334.0,
"reward": 0.749801754951477,
"reward_std": 0.5025944709777832,
"rewards/cosine_scaled_reward/mean": -0.10166161507368088,
"rewards/cosine_scaled_reward/std": 0.4026789367198944,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 1362.421875,
"completions/mean_terminated_length": 1050.7955322265625,
"completions/min_length": 427.0,
"completions/min_terminated_length": 427.0,
"epoch": 0.1702857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2945536971092224,
"learning_rate": 2.729523361034538e-07,
"loss": -0.0,
"num_tokens": 14731425.0,
"reward": 0.5095837116241455,
"reward_std": 0.9072202444076538,
"rewards/cosine_scaled_reward/mean": -0.12020813673734665,
"rewards/cosine_scaled_reward/std": 0.44057199358940125,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 1051.125,
"completions/mean_terminated_length": 984.666748046875,
"completions/min_length": 422.0,
"completions/min_terminated_length": 422.0,
"epoch": 0.17142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31511664390563965,
"learning_rate": 2.6680582402757324e-07,
"loss": 0.0,
"num_tokens": 14809201.0,
"reward": 0.8506758213043213,
"reward_std": 0.6328262686729431,
"rewards/cosine_scaled_reward/mean": -0.06684959679841995,
"rewards/cosine_scaled_reward/std": 0.4310523569583893,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 1043.53125,
"completions/mean_terminated_length": 900.0357666015625,
"completions/min_length": 352.0,
"completions/min_terminated_length": 352.0,
"epoch": 0.17257142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.317804217338562,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0,
"num_tokens": 14886667.0,
"reward": 1.1822679042816162,
"reward_std": 0.8676217794418335,
"rewards/cosine_scaled_reward/mean": 0.13019640743732452,
"rewards/cosine_scaled_reward/std": 0.5323836207389832,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1948.0,
"completions/mean_length": 1029.5,
"completions/mean_terminated_length": 979.4097900390625,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.1737142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.41914206743240356,
"learning_rate": 2.547734369542718e-07,
"loss": 0.0,
"num_tokens": 14963219.0,
"reward": 0.8113790154457092,
"reward_std": 0.7262269258499146,
"rewards/cosine_scaled_reward/mean": -0.07868549972772598,
"rewards/cosine_scaled_reward/std": 0.46254217624664307,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 1182.75,
"completions/mean_terminated_length": 1041.16357421875,
"completions/min_length": 407.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.17485714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30266106128692627,
"learning_rate": 2.488912271385139e-07,
"loss": -0.0,
"num_tokens": 15050059.0,
"reward": 0.5501826405525208,
"reward_std": 0.3770068287849426,
"rewards/cosine_scaled_reward/mean": -0.19365867972373962,
"rewards/cosine_scaled_reward/std": 0.18398644030094147,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1711.0,
"completions/mean_length": 1236.765625,
"completions/mean_terminated_length": 966.3541870117188,
"completions/min_length": 473.0,
"completions/min_terminated_length": 473.0,
"epoch": 0.176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31099703907966614,
"learning_rate": 2.4310073797187573e-07,
"loss": -0.0,
"num_tokens": 15140276.0,
"reward": 0.590886116027832,
"reward_std": 0.6541597843170166,
"rewards/cosine_scaled_reward/mean": -0.1342444270849228,
"rewards/cosine_scaled_reward/std": 0.36679577827453613,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1770.0,
"completions/mean_length": 910.109375,
"completions/mean_terminated_length": 792.3965454101562,
"completions/min_length": 320.0,
"completions/min_terminated_length": 320.0,
"epoch": 0.17714285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35653260350227356,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0,
"num_tokens": 15209147.0,
"reward": 0.8104115724563599,
"reward_std": 0.6592832803726196,
"rewards/cosine_scaled_reward/mean": -0.06354419887065887,
"rewards/cosine_scaled_reward/std": 0.4711204171180725,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1213.96875,
"completions/mean_terminated_length": 980.4400024414062,
"completions/min_length": 393.0,
"completions/min_terminated_length": 393.0,
"epoch": 0.1782857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29153549671173096,
"learning_rate": 2.3180194846605364e-07,
"loss": 0.0,
"num_tokens": 15296945.0,
"reward": 0.6139351725578308,
"reward_std": 0.6581733226776123,
"rewards/cosine_scaled_reward/mean": -0.1070949137210846,
"rewards/cosine_scaled_reward/std": 0.4280206561088562,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1971.0,
"completions/mean_length": 1071.921875,
"completions/mean_terminated_length": 1023.91796875,
"completions/min_length": 455.0,
"completions/min_terminated_length": 455.0,
"epoch": 0.17942857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3182876706123352,
"learning_rate": 2.2629708984760706e-07,
"loss": -0.0,
"num_tokens": 15375508.0,
"reward": 0.6302845478057861,
"reward_std": 0.7132326364517212,
"rewards/cosine_scaled_reward/mean": -0.16142022609710693,
"rewards/cosine_scaled_reward/std": 0.37519919872283936,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 943.375,
"completions/mean_terminated_length": 907.7418823242188,
"completions/min_length": 136.0,
"completions/min_terminated_length": 136.0,
"epoch": 0.18057142857142858,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.2808084189891815,
"learning_rate": 2.2089083427137329e-07,
"loss": 0.0,
"num_tokens": 15446004.0,
"reward": 0.9674867391586304,
"reward_std": 0.5017939805984497,
"rewards/cosine_scaled_reward/mean": -0.008444137871265411,
"rewards/cosine_scaled_reward/std": 0.49054765701293945,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1937.0,
"completions/mean_length": 1154.34375,
"completions/mean_terminated_length": 1026.6785888671875,
"completions/min_length": 237.0,
"completions/min_terminated_length": 237.0,
"epoch": 0.18171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2708660066127777,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.0,
"num_tokens": 15531562.0,
"reward": 0.5022876262664795,
"reward_std": 0.5563845038414001,
"rewards/cosine_scaled_reward/mean": -0.20979365706443787,
"rewards/cosine_scaled_reward/std": 0.2771652638912201,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0,
"completions/max_length": 2000.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 900.078125,
"completions/mean_terminated_length": 900.078125,
"completions/min_length": 384.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.18285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3298548460006714,
"learning_rate": 2.1038068889975259e-07,
"loss": -0.0,
"num_tokens": 15600751.0,
"reward": 1.3369240760803223,
"reward_std": 0.6572985053062439,
"rewards/cosine_scaled_reward/mean": 0.16846203804016113,
"rewards/cosine_scaled_reward/std": 0.5345699787139893,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1737.0,
"completions/mean_length": 1066.46875,
"completions/mean_terminated_length": 926.2500610351562,
"completions/min_length": 326.0,
"completions/min_terminated_length": 326.0,
"epoch": 0.184,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31214678287506104,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0,
"num_tokens": 15678997.0,
"reward": 0.7608721256256104,
"reward_std": 0.6540825366973877,
"rewards/cosine_scaled_reward/mean": -0.08831392973661423,
"rewards/cosine_scaled_reward/std": 0.35966333746910095,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 1089.203125,
"completions/mean_terminated_length": 952.232177734375,
"completions/min_length": 470.0,
"completions/min_terminated_length": 470.0,
"epoch": 0.18514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3335266411304474,
"learning_rate": 2.0028431734436308e-07,
"loss": 0.0,
"num_tokens": 15759010.0,
"reward": 0.6903920769691467,
"reward_std": 0.522528886795044,
"rewards/cosine_scaled_reward/mean": -0.10792896896600723,
"rewards/cosine_scaled_reward/std": 0.35296061635017395,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1494.0,
"completions/mean_length": 1043.9375,
"completions/mean_terminated_length": 812.2307739257812,
"completions/min_length": 405.0,
"completions/min_terminated_length": 405.0,
"epoch": 0.18628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33791008591651917,
"learning_rate": 1.9539516087697517e-07,
"loss": -0.0,
"num_tokens": 15837006.0,
"reward": 1.0535857677459717,
"reward_std": 0.7004721164703369,
"rewards/cosine_scaled_reward/mean": 0.12054289877414703,
"rewards/cosine_scaled_reward/std": 0.5006844997406006,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1648.0,
"completions/mean_length": 933.0,
"completions/mean_terminated_length": 858.6666870117188,
"completions/min_length": 340.0,
"completions/min_terminated_length": 340.0,
"epoch": 0.18742857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3110557794570923,
"learning_rate": 1.9061402047871833e-07,
"loss": 0.0,
"num_tokens": 15907246.0,
"reward": 1.0376479625701904,
"reward_std": 0.5748878121376038,
"rewards/cosine_scaled_reward/mean": 0.04226145148277283,
"rewards/cosine_scaled_reward/std": 0.48099249601364136,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1855.0,
"completions/mean_length": 1060.765625,
"completions/mean_terminated_length": 939.5263061523438,
"completions/min_length": 329.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.18857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3239347040653229,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0,
"num_tokens": 15986551.0,
"reward": 0.8562759160995483,
"reward_std": 0.5603832602500916,
"rewards/cosine_scaled_reward/mean": -0.06404951959848404,
"rewards/cosine_scaled_reward/std": 0.4168683588504791,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1649.0,
"completions/mean_length": 1088.78125,
"completions/mean_terminated_length": 951.7500610351562,
"completions/min_length": 439.0,
"completions/min_terminated_length": 439.0,
"epoch": 0.18971428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2883976697921753,
"learning_rate": 1.8138158006995363e-07,
"loss": -0.0,
"num_tokens": 16067809.0,
"reward": 0.7584704160690308,
"reward_std": 0.6604156494140625,
"rewards/cosine_scaled_reward/mean": -0.05826478824019432,
"rewards/cosine_scaled_reward/std": 0.3981381356716156,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1756.0,
"completions/mean_length": 939.125,
"completions/mean_terminated_length": 903.3547973632812,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"epoch": 0.19085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2965018153190613,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0,
"num_tokens": 16139457.0,
"reward": 0.7410329580307007,
"reward_std": 0.5566695928573608,
"rewards/cosine_scaled_reward/mean": -0.11385852843523026,
"rewards/cosine_scaled_reward/std": 0.3546922504901886,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1825.0,
"completions/mean_length": 1022.125,
"completions/mean_terminated_length": 1005.84130859375,
"completions/min_length": 382.0,
"completions/min_terminated_length": 382.0,
"epoch": 0.192,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2879563868045807,
"learning_rate": 1.7259824442455923e-07,
"loss": -0.0,
"num_tokens": 16215713.0,
"reward": 0.8576459288597107,
"reward_std": 0.6195322275161743,
"rewards/cosine_scaled_reward/mean": -0.06336455047130585,
"rewards/cosine_scaled_reward/std": 0.4510643184185028,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1484.0,
"completions/mean_length": 926.8125,
"completions/mean_terminated_length": 909.0159301757812,
"completions/min_length": 529.0,
"completions/min_terminated_length": 529.0,
"epoch": 0.19314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27474501729011536,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.0,
"num_tokens": 16285653.0,
"reward": 1.4680557250976562,
"reward_std": 0.7384843826293945,
"rewards/cosine_scaled_reward/mean": 0.24184036254882812,
"rewards/cosine_scaled_reward/std": 0.5405412316322327,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 1003.953125,
"completions/mean_terminated_length": 854.8035888671875,
"completions/min_length": 354.0,
"completions/min_terminated_length": 354.0,
"epoch": 0.19428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30747127532958984,
"learning_rate": 1.6427471468404952e-07,
"loss": 0.0,
"num_tokens": 16359690.0,
"reward": 0.9851430654525757,
"reward_std": 0.3564821481704712,
"rewards/cosine_scaled_reward/mean": 0.055071547627449036,
"rewards/cosine_scaled_reward/std": 0.4447442889213562,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.265625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1212.328125,
"completions/mean_terminated_length": 910.0637817382812,
"completions/min_length": 542.0,
"completions/min_terminated_length": 542.0,
"epoch": 0.19542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3194407820701599,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0,
"num_tokens": 16447671.0,
"reward": 0.8521295785903931,
"reward_std": 0.6044571399688721,
"rewards/cosine_scaled_reward/mean": 0.04325229674577713,
"rewards/cosine_scaled_reward/std": 0.4702494442462921,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 973.09375,
"completions/mean_terminated_length": 882.0,
"completions/min_length": 240.0,
"completions/min_terminated_length": 240.0,
"epoch": 0.19657142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3858748972415924,
"learning_rate": 1.5642113178727193e-07,
"loss": -0.0,
"num_tokens": 16520565.0,
"reward": 1.4210284948349,
"reward_std": 0.6327061057090759,
"rewards/cosine_scaled_reward/mean": 0.21832676231861115,
"rewards/cosine_scaled_reward/std": 0.5338378548622131,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2041.0,
"completions/mean_length": 874.0,
"completions/mean_terminated_length": 816.2622680664062,
"completions/min_length": 277.0,
"completions/min_terminated_length": 277.0,
"epoch": 0.1977142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.34975388646125793,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0,
"num_tokens": 16587813.0,
"reward": 0.9026652574539185,
"reward_std": 0.7158900499343872,
"rewards/cosine_scaled_reward/mean": -0.025229886174201965,
"rewards/cosine_scaled_reward/std": 0.411268025636673,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1908.0,
"completions/mean_length": 946.546875,
"completions/mean_terminated_length": 911.01611328125,
"completions/min_length": 452.0,
"completions/min_terminated_length": 452.0,
"epoch": 0.19885714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3235276937484741,
"learning_rate": 1.4904706411523448e-07,
"loss": 0.0,
"num_tokens": 16658728.0,
"reward": 0.9661835432052612,
"reward_std": 0.6674793362617493,
"rewards/cosine_scaled_reward/mean": -0.009095773100852966,
"rewards/cosine_scaled_reward/std": 0.4818039536476135,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1785.0,
"completions/mean_length": 946.375,
"completions/mean_terminated_length": 910.8386840820312,
"completions/min_length": 442.0,
"completions/min_terminated_length": 442.0,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3162003755569458,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0,
"num_tokens": 16730120.0,
"reward": 0.9043581485748291,
"reward_std": 0.41858798265457153,
"rewards/cosine_scaled_reward/mean": -0.040008433163166046,
"rewards/cosine_scaled_reward/std": 0.4500538408756256,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1066.109375,
"completions/mean_terminated_length": 964.5344848632812,
"completions/min_length": 299.0,
"completions/min_terminated_length": 299.0,
"epoch": 0.20114285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33956244587898254,
"learning_rate": 1.4216149583350755e-07,
"loss": -0.0,
"num_tokens": 16809519.0,
"reward": 0.7081954479217529,
"reward_std": 0.5614209771156311,
"rewards/cosine_scaled_reward/mean": -0.11465225368738174,
"rewards/cosine_scaled_reward/std": 0.34775587916374207,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1808.0,
"completions/mean_length": 917.015625,
"completions/mean_terminated_length": 899.0635375976562,
"completions/min_length": 449.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.2022857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32397690415382385,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0,
"num_tokens": 16878432.0,
"reward": 1.0032364130020142,
"reward_std": 0.7183334827423096,
"rewards/cosine_scaled_reward/mean": 0.009430669248104095,
"rewards/cosine_scaled_reward/std": 0.4540334641933441,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1671.0,
"completions/mean_length": 1051.28125,
"completions/mean_terminated_length": 928.877197265625,
"completions/min_length": 384.0,
"completions/min_terminated_length": 384.0,
"epoch": 0.20342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32052552700042725,
"learning_rate": 1.3577281594640182e-07,
"loss": -0.0,
"num_tokens": 16957250.0,
"reward": 0.7599722146987915,
"reward_std": 0.612259566783905,
"rewards/cosine_scaled_reward/mean": -0.08095138520002365,
"rewards/cosine_scaled_reward/std": 0.3940528631210327,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1793.0,
"completions/mean_length": 1150.203125,
"completions/mean_terminated_length": 921.3529663085938,
"completions/min_length": 367.0,
"completions/min_terminated_length": 367.0,
"epoch": 0.20457142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36251163482666016,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0,
"num_tokens": 17041695.0,
"reward": 0.5754084587097168,
"reward_std": 0.5908599495887756,
"rewards/cosine_scaled_reward/mean": -0.11854580044746399,
"rewards/cosine_scaled_reward/std": 0.32444170117378235,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1004.453125,
"completions/mean_terminated_length": 896.5,
"completions/min_length": 256.0,
"completions/min_terminated_length": 256.0,
"epoch": 0.2057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3620760142803192,
"learning_rate": 1.2988880807625927e-07,
"loss": -0.0,
"num_tokens": 17117156.0,
"reward": 1.403580904006958,
"reward_std": 0.9170527458190918,
"rewards/cosine_scaled_reward/mean": 0.2096029818058014,
"rewards/cosine_scaled_reward/std": 0.5256889462471008,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1716.0,
"completions/mean_length": 1117.15625,
"completions/mean_terminated_length": 984.1785888671875,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.20685714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2816322147846222,
"learning_rate": 1.2713832064634125e-07,
"loss": -0.0,
"num_tokens": 17200126.0,
"reward": 0.8361604809761047,
"reward_std": 0.6540721654891968,
"rewards/cosine_scaled_reward/mean": -0.050669748336076736,
"rewards/cosine_scaled_reward/std": 0.44142434000968933,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 1210.0,
"completions/mean_terminated_length": 930.6666870117188,
"completions/min_length": 287.0,
"completions/min_terminated_length": 287.0,
"epoch": 0.208,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2651442885398865,
"learning_rate": 1.2451664098030743e-07,
"loss": 0.0,
"num_tokens": 17287358.0,
"reward": 0.62415611743927,
"reward_std": 0.6586728096008301,
"rewards/cosine_scaled_reward/mean": -0.10198444128036499,
"rewards/cosine_scaled_reward/std": 0.3847215175628662,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 861.359375,
"completions/mean_terminated_length": 823.0806274414062,
"completions/min_length": 362.0,
"completions/min_terminated_length": 362.0,
"epoch": 0.20914285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3385181427001953,
"learning_rate": 1.220245676671809e-07,
"loss": -0.0,
"num_tokens": 17353101.0,
"reward": 1.0283212661743164,
"reward_std": 0.6364034414291382,
"rewards/cosine_scaled_reward/mean": 0.029785610735416412,
"rewards/cosine_scaled_reward/std": 0.42320308089256287,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1726.0,
"completions/mean_length": 1004.765625,
"completions/mean_terminated_length": 916.35595703125,
"completions/min_length": 174.0,
"completions/min_terminated_length": 174.0,
"epoch": 0.2102857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.30591627955436707,
"learning_rate": 1.1966285981663407e-07,
"loss": -0.0,
"num_tokens": 17428758.0,
"reward": 0.7365655899047852,
"reward_std": 0.4397754371166229,
"rewards/cosine_scaled_reward/mean": -0.12390469759702682,
"rewards/cosine_scaled_reward/std": 0.3771846890449524,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 976.359375,
"completions/mean_terminated_length": 959.3492431640625,
"completions/min_length": 319.0,
"completions/min_terminated_length": 319.0,
"epoch": 0.21142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3458598852157593,
"learning_rate": 1.1743223682775649e-07,
"loss": -0.0,
"num_tokens": 17501429.0,
"reward": 0.9240812063217163,
"reward_std": 0.5495443344116211,
"rewards/cosine_scaled_reward/mean": -0.03795938193798065,
"rewards/cosine_scaled_reward/std": 0.46237269043922424,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1951.0,
"completions/mean_length": 1122.5,
"completions/mean_terminated_length": 951.1111450195312,
"completions/min_length": 469.0,
"completions/min_terminated_length": 469.0,
"epoch": 0.21257142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.31067562103271484,
"learning_rate": 1.1533337816991931e-07,
"loss": -0.0,
"num_tokens": 17583965.0,
"reward": 0.8422703742980957,
"reward_std": 0.6076713800430298,
"rewards/cosine_scaled_reward/mean": -0.01636481285095215,
"rewards/cosine_scaled_reward/std": 0.4099017381668091,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1007.21875,
"completions/mean_terminated_length": 937.8333740234375,
"completions/min_length": 433.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.21371428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32321879267692566,
"learning_rate": 1.1336692317580158e-07,
"loss": -0.0,
"num_tokens": 17658851.0,
"reward": 0.7644214630126953,
"reward_std": 0.4659081697463989,
"rewards/cosine_scaled_reward/mean": -0.08653924614191055,
"rewards/cosine_scaled_reward/std": 0.38629019260406494,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 992.703125,
"completions/mean_terminated_length": 958.6612548828125,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.21485714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.36401256918907166,
"learning_rate": 1.1153347084664419e-07,
"loss": -0.0,
"num_tokens": 17734184.0,
"reward": 0.5271173715591431,
"reward_std": 0.5396482348442078,
"rewards/cosine_scaled_reward/mean": -0.23644131422042847,
"rewards/cosine_scaled_reward/std": 0.34559664130210876,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1957.0,
"completions/mean_length": 834.171875,
"completions/mean_terminated_length": 753.2500610351562,
"completions/min_length": 179.0,
"completions/min_terminated_length": 179.0,
"epoch": 0.216,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.356674462556839,
"learning_rate": 1.0983357966978745e-07,
"loss": -0.0,
"num_tokens": 17796859.0,
"reward": 0.8302590847015381,
"reward_std": 0.6041134595870972,
"rewards/cosine_scaled_reward/mean": -0.07705795764923096,
"rewards/cosine_scaled_reward/std": 0.43526527285575867,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 974.890625,
"completions/mean_terminated_length": 940.274169921875,
"completions/min_length": 449.0,
"completions/min_terminated_length": 449.0,
"epoch": 0.21714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29584038257598877,
"learning_rate": 1.0826776744855121e-07,
"loss": 0.0,
"num_tokens": 17869020.0,
"reward": 1.0407956838607788,
"reward_std": 0.5199205875396729,
"rewards/cosine_scaled_reward/mean": 0.020397864282131195,
"rewards/cosine_scaled_reward/std": 0.4723619520664215,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1847.0,
"completions/mean_length": 910.59375,
"completions/mean_terminated_length": 892.5397338867188,
"completions/min_length": 259.0,
"completions/min_terminated_length": 259.0,
"epoch": 0.21828571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33857786655426025,
"learning_rate": 1.068365111445064e-07,
"loss": -0.0,
"num_tokens": 17937586.0,
"reward": 1.044985055923462,
"reward_std": 0.7015856504440308,
"rewards/cosine_scaled_reward/mean": 0.030305005609989166,
"rewards/cosine_scaled_reward/std": 0.4603172242641449,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.203125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2025.0,
"completions/mean_length": 1249.640625,
"completions/mean_terminated_length": 1046.1373291015625,
"completions/min_length": 404.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.21942857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2704383134841919,
"learning_rate": 1.0554024673218806e-07,
"loss": -0.0,
"num_tokens": 18028859.0,
"reward": 0.5498086810112,
"reward_std": 0.5540546178817749,
"rewards/cosine_scaled_reward/mean": -0.17040817439556122,
"rewards/cosine_scaled_reward/std": 0.2906738519668579,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1859.0,
"completions/mean_length": 1235.921875,
"completions/mean_terminated_length": 1067.3773193359375,
"completions/min_length": 351.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.22057142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3072021007537842,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0,
"num_tokens": 18119102.0,
"reward": 0.8473511338233948,
"reward_std": 0.7724316716194153,
"rewards/cosine_scaled_reward/mean": -0.021636933088302612,
"rewards/cosine_scaled_reward/std": 0.4808884263038635,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1306.71875,
"completions/mean_terminated_length": 1135.6539306640625,
"completions/min_length": 559.0,
"completions/min_terminated_length": 559.0,
"epoch": 0.22171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2672087252140045,
"learning_rate": 1.0335423176140511e-07,
"loss": 0.0,
"num_tokens": 18214092.0,
"reward": 0.8827314972877502,
"reward_std": 0.7681792974472046,
"rewards/cosine_scaled_reward/mean": 0.011678241193294525,
"rewards/cosine_scaled_reward/std": 0.48625898361206055,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1920.0,
"completions/mean_length": 1073.125,
"completions/mean_terminated_length": 1025.1802978515625,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.22285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2599698603153229,
"learning_rate": 1.0246514708427701e-07,
"loss": -0.0,
"num_tokens": 18293916.0,
"reward": 0.9670987129211426,
"reward_std": 0.7788794040679932,
"rewards/cosine_scaled_reward/mean": 0.006986856460571289,
"rewards/cosine_scaled_reward/std": 0.5052981972694397,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 1129.515625,
"completions/mean_terminated_length": 1034.5,
"completions/min_length": 356.0,
"completions/min_terminated_length": 356.0,
"epoch": 0.224,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.329662024974823,
"learning_rate": 1.017123858587145e-07,
"loss": -0.0,
"num_tokens": 18377797.0,
"reward": 0.9397312998771667,
"reward_std": 0.7938928604125977,
"rewards/cosine_scaled_reward/mean": 0.016740664839744568,
"rewards/cosine_scaled_reward/std": 0.4878515601158142,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1673.0,
"completions/mean_length": 845.515625,
"completions/mean_terminated_length": 806.7257690429688,
"completions/min_length": 359.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.22514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.35329359769821167,
"learning_rate": 1.0109617738307911e-07,
"loss": -0.0,
"num_tokens": 18442278.0,
"reward": 1.326704740524292,
"reward_std": 0.6592847108840942,
"rewards/cosine_scaled_reward/mean": 0.17116491496562958,
"rewards/cosine_scaled_reward/std": 0.5057182908058167,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1691.0,
"completions/mean_length": 1010.03125,
"completions/mean_terminated_length": 882.5614013671875,
"completions/min_length": 399.0,
"completions/min_terminated_length": 399.0,
"epoch": 0.22628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.32196998596191406,
"learning_rate": 1.0061670936044178e-07,
"loss": 0.0,
"num_tokens": 18518424.0,
"reward": 0.9607409238815308,
"reward_std": 0.6629819869995117,
"rewards/cosine_scaled_reward/mean": -0.004004567861557007,
"rewards/cosine_scaled_reward/std": 0.4992019534111023,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 1223.03125,
"completions/mean_terminated_length": 1121.7193603515625,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.22742857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2712598741054535,
"learning_rate": 1.002741278414069e-07,
"loss": 0.0,
"num_tokens": 18608202.0,
"reward": 0.7551803588867188,
"reward_std": 0.5998207330703735,
"rewards/cosine_scaled_reward/mean": -0.11459730565547943,
"rewards/cosine_scaled_reward/std": 0.3166539669036865,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1882.0,
"completions/mean_length": 974.171875,
"completions/mean_terminated_length": 921.360595703125,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.22857142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.285355806350708,
"learning_rate": 1.0006853717962393e-07,
"loss": -0.0,
"num_tokens": 18680669.0,
"reward": 1.1672099828720093,
"reward_std": 0.7903769016265869,
"rewards/cosine_scaled_reward/mean": 0.09922999888658524,
"rewards/cosine_scaled_reward/std": 0.5049266219139099,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 200
},
{
"epoch": 0.22857142857142856,
"step": 200,
"total_flos": 0.0,
"train_loss": -1.9138678908348085e-09,
"train_runtime": 10259.1504,
"train_samples_per_second": 1.248,
"train_steps_per_second": 0.019
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 18680669,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}