TT_L0.2_H0.2_dr_grpo / trainer_state.json
LLucass's picture
Model save
aa5f5a3 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.22857142857142856,
"eval_steps": 500,
"global_step": 200,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20054349303245544,
"learning_rate": 0.0,
"loss": 0.0427,
"num_tokens": 118418.0,
"reward": 0.17899775505065918,
"reward_std": 0.7650213241577148,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19502359628677368,
"learning_rate": 5e-08,
"loss": 0.0561,
"num_tokens": 239748.0,
"reward": 0.3848632574081421,
"reward_std": 0.9111153483390808,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1287.0,
"completions/mean_length": 1944.453125,
"completions/mean_terminated_length": 943.5,
"completions/min_length": 608.0,
"completions/min_terminated_length": 608.0,
"epoch": 0.0034285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.230765700340271,
"learning_rate": 1e-07,
"loss": 0.0549,
"num_tokens": 374665.0,
"reward": -0.28856638073921204,
"reward_std": 0.4003669023513794,
"rewards/cosine_scaled_reward/mean": -0.19897069036960602,
"rewards/cosine_scaled_reward/std": 0.18252794444561005,
"rewards/format_reward/mean": 0.109375,
"rewards/format_reward/std": 0.3145764470100403,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1871.0,
"completions/mean_length": 1592.3125,
"completions/mean_terminated_length": 1006.4285888671875,
"completions/min_length": 450.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.004571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20995420217514038,
"learning_rate": 1.5e-07,
"loss": 0.1266,
"num_tokens": 486381.0,
"reward": 0.20640414953231812,
"reward_std": 0.8193759918212891,
"rewards/cosine_scaled_reward/mean": -0.13117292523384094,
"rewards/cosine_scaled_reward/std": 0.35454094409942627,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1680.0,
"completions/mean_length": 2002.859375,
"completions/mean_terminated_length": 1085.0,
"completions/min_length": 755.0,
"completions/min_terminated_length": 755.0,
"epoch": 0.005714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23816199600696564,
"learning_rate": 2e-07,
"loss": 0.01,
"num_tokens": 625380.0,
"reward": -0.41131818294525146,
"reward_std": 0.30660682916641235,
"rewards/cosine_scaled_reward/mean": -0.24472159147262573,
"rewards/cosine_scaled_reward/std": 0.19079075753688812,
"rewards/format_reward/mean": 0.078125,
"rewards/format_reward/std": 0.27048972249031067,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1344.0,
"completions/mean_length": 1890.0,
"completions/mean_terminated_length": 784.0,
"completions/min_length": 440.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.006857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24285951256752014,
"learning_rate": 2.5e-07,
"loss": -0.0119,
"num_tokens": 757988.0,
"reward": -0.24828195571899414,
"reward_std": 0.3839319050312042,
"rewards/cosine_scaled_reward/mean": -0.19445347785949707,
"rewards/cosine_scaled_reward/std": 0.19692479074001312,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1941.0,
"completions/mean_length": 1935.046875,
"completions/mean_terminated_length": 1390.8182373046875,
"completions/min_length": 886.0,
"completions/min_terminated_length": 886.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2183438539505005,
"learning_rate": 3e-07,
"loss": 0.0412,
"num_tokens": 892239.0,
"reward": -0.07044821977615356,
"reward_std": 0.5991545915603638,
"rewards/cosine_scaled_reward/mean": -0.14459910988807678,
"rewards/cosine_scaled_reward/std": 0.3703240156173706,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1957.0,
"completions/mean_length": 1743.921875,
"completions/mean_terminated_length": 966.8333129882812,
"completions/min_length": 296.0,
"completions/min_terminated_length": 296.0,
"epoch": 0.009142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18490855395793915,
"learning_rate": 3.5e-07,
"loss": 0.0096,
"num_tokens": 1014266.0,
"reward": 0.07391861081123352,
"reward_std": 0.5062483549118042,
"rewards/cosine_scaled_reward/mean": -0.11929068714380264,
"rewards/cosine_scaled_reward/std": 0.4095526933670044,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1790.0,
"completions/mean_length": 1965.46875,
"completions/mean_terminated_length": 1461.111083984375,
"completions/min_length": 1029.0,
"completions/min_terminated_length": 1029.0,
"epoch": 0.010285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21707069873809814,
"learning_rate": 4e-07,
"loss": 0.0566,
"num_tokens": 1151512.0,
"reward": -0.15350507199764252,
"reward_std": 0.7245944738388062,
"rewards/cosine_scaled_reward/mean": -0.18612754344940186,
"rewards/cosine_scaled_reward/std": 0.30883485078811646,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1745.0,
"completions/mean_length": 1682.59375,
"completions/mean_terminated_length": 817.1578979492188,
"completions/min_length": 394.0,
"completions/min_terminated_length": 394.0,
"epoch": 0.011428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20094214379787445,
"learning_rate": 4.5e-07,
"loss": 0.0457,
"num_tokens": 1270030.0,
"reward": 0.027805477380752563,
"reward_std": 0.4805509150028229,
"rewards/cosine_scaled_reward/mean": -0.14234726130962372,
"rewards/cosine_scaled_reward/std": 0.26565250754356384,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.953125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1094.0,
"completions/mean_length": 1998.15625,
"completions/mean_terminated_length": 984.6666870117188,
"completions/min_length": 798.0,
"completions/min_terminated_length": 798.0,
"epoch": 0.012571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2170705795288086,
"learning_rate": 5e-07,
"loss": 0.0247,
"num_tokens": 1409584.0,
"reward": -0.43332377076148987,
"reward_std": 0.36288702487945557,
"rewards/cosine_scaled_reward/mean": -0.24791188538074493,
"rewards/cosine_scaled_reward/std": 0.17533892393112183,
"rewards/format_reward/mean": 0.0625,
"rewards/format_reward/std": 0.24397502839565277,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2044.0,
"completions/mean_length": 1630.375,
"completions/mean_terminated_length": 1093.4285888671875,
"completions/min_length": 427.0,
"completions/min_terminated_length": 427.0,
"epoch": 0.013714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2160935252904892,
"learning_rate": 5.5e-07,
"loss": 0.0753,
"num_tokens": 1524872.0,
"reward": 0.0067175328731536865,
"reward_std": 0.689138650894165,
"rewards/cosine_scaled_reward/mean": -0.22320374846458435,
"rewards/cosine_scaled_reward/std": 0.3645767867565155,
"rewards/format_reward/mean": 0.453125,
"rewards/format_reward/std": 0.501733124256134,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 1833.453125,
"completions/mean_terminated_length": 1067.21435546875,
"completions/min_length": 616.0,
"completions/min_terminated_length": 616.0,
"epoch": 0.014857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2122364640235901,
"learning_rate": 6e-07,
"loss": 0.0326,
"num_tokens": 1653253.0,
"reward": -0.09265299141407013,
"reward_std": 0.5985201001167297,
"rewards/cosine_scaled_reward/mean": -0.17913900315761566,
"rewards/cosine_scaled_reward/std": 0.306300550699234,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1897.0,
"completions/mean_length": 1823.40625,
"completions/mean_terminated_length": 1202.4705810546875,
"completions/min_length": 605.0,
"completions/min_terminated_length": 605.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2076576203107834,
"learning_rate": 6.5e-07,
"loss": 0.0261,
"num_tokens": 1780559.0,
"reward": 0.005522748455405235,
"reward_std": 0.7086418867111206,
"rewards/cosine_scaled_reward/mean": -0.1378636360168457,
"rewards/cosine_scaled_reward/std": 0.35400503873825073,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1328.0,
"completions/mean_length": 1698.171875,
"completions/mean_terminated_length": 731.0,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.017142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1969502866268158,
"learning_rate": 7e-07,
"loss": 0.0216,
"num_tokens": 1900162.0,
"reward": 0.2789269685745239,
"reward_std": 0.43547046184539795,
"rewards/cosine_scaled_reward/mean": -0.00897398591041565,
"rewards/cosine_scaled_reward/std": 0.4515364170074463,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.018285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23300249874591827,
"learning_rate": 7.5e-07,
"loss": -0.0,
"num_tokens": 2041674.0,
"reward": -0.5078557729721069,
"reward_std": 0.3458974361419678,
"rewards/cosine_scaled_reward/mean": -0.25392788648605347,
"rewards/cosine_scaled_reward/std": 0.18378609418869019,
"rewards/format_reward/mean": 0.0,
"rewards/format_reward/std": 0.0,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1563.734375,
"completions/mean_terminated_length": 941.107177734375,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.019428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20892462134361267,
"learning_rate": 8e-07,
"loss": 0.0477,
"num_tokens": 2152273.0,
"reward": 0.3328002989292145,
"reward_std": 0.7669951319694519,
"rewards/cosine_scaled_reward/mean": -0.06797486543655396,
"rewards/cosine_scaled_reward/std": 0.4412795305252075,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1465.0,
"completions/mean_length": 1778.90625,
"completions/mean_terminated_length": 899.86669921875,
"completions/min_length": 535.0,
"completions/min_terminated_length": 535.0,
"epoch": 0.02057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19322611391544342,
"learning_rate": 8.499999999999999e-07,
"loss": 0.0726,
"num_tokens": 2276499.0,
"reward": -0.18389344215393066,
"reward_std": 0.5934990644454956,
"rewards/cosine_scaled_reward/mean": -0.23257172107696533,
"rewards/cosine_scaled_reward/std": 0.256833553314209,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1771.0,
"completions/mean_length": 1869.53125,
"completions/mean_terminated_length": 1232.1429443359375,
"completions/min_length": 711.0,
"completions/min_terminated_length": 711.0,
"epoch": 0.021714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21417103707790375,
"learning_rate": 9e-07,
"loss": 0.0378,
"num_tokens": 2407405.0,
"reward": -0.05162222683429718,
"reward_std": 0.7635236978530884,
"rewards/cosine_scaled_reward/mean": -0.158623605966568,
"rewards/cosine_scaled_reward/std": 0.4003170132637024,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1836.0,
"completions/mean_length": 1572.90625,
"completions/mean_terminated_length": 878.5385131835938,
"completions/min_length": 369.0,
"completions/min_terminated_length": 369.0,
"epoch": 0.022857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1591554582118988,
"learning_rate": 9.499999999999999e-07,
"loss": 0.0507,
"num_tokens": 2519423.0,
"reward": 0.2816518545150757,
"reward_std": 0.7381908893585205,
"rewards/cosine_scaled_reward/mean": -0.07011157274246216,
"rewards/cosine_scaled_reward/std": 0.35477158427238464,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 1776.28125,
"completions/mean_terminated_length": 1081.888916015625,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22487252950668335,
"learning_rate": 1e-06,
"loss": 0.0137,
"num_tokens": 2643913.0,
"reward": -0.0122755765914917,
"reward_std": 0.4569401443004608,
"rewards/cosine_scaled_reward/mean": -0.16238778829574585,
"rewards/cosine_scaled_reward/std": 0.3900769054889679,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1851.0,
"completions/mean_length": 1273.1875,
"completions/mean_terminated_length": 776.5128173828125,
"completions/min_length": 242.0,
"completions/min_terminated_length": 242.0,
"epoch": 0.025142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1901247799396515,
"learning_rate": 9.99931462820376e-07,
"loss": -0.0442,
"num_tokens": 2734413.0,
"reward": 0.5235691666603088,
"reward_std": 0.4210290312767029,
"rewards/cosine_scaled_reward/mean": -0.07415291666984558,
"rewards/cosine_scaled_reward/std": 0.40765848755836487,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1954.0,
"completions/mean_length": 1640.84375,
"completions/mean_terminated_length": 1082.888916015625,
"completions/min_length": 363.0,
"completions/min_terminated_length": 363.0,
"epoch": 0.026285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21930935978889465,
"learning_rate": 9.997258721585931e-07,
"loss": 0.0518,
"num_tokens": 2850219.0,
"reward": 0.23656107485294342,
"reward_std": 0.6851356029510498,
"rewards/cosine_scaled_reward/mean": -0.10046947002410889,
"rewards/cosine_scaled_reward/std": 0.45323267579078674,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 1785.265625,
"completions/mean_terminated_length": 1113.8333740234375,
"completions/min_length": 475.0,
"completions/min_terminated_length": 475.0,
"epoch": 0.027428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.196747824549675,
"learning_rate": 9.993832906395582e-07,
"loss": 0.0687,
"num_tokens": 2975404.0,
"reward": 0.04860962927341461,
"reward_std": 0.8576602935791016,
"rewards/cosine_scaled_reward/mean": -0.1475701779127121,
"rewards/cosine_scaled_reward/std": 0.4082482159137726,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1752.0,
"completions/mean_length": 1695.234375,
"completions/mean_terminated_length": 919.1500244140625,
"completions/min_length": 502.0,
"completions/min_terminated_length": 502.0,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22251193225383759,
"learning_rate": 9.989038226169207e-07,
"loss": 0.0401,
"num_tokens": 3094195.0,
"reward": 0.2244701385498047,
"reward_std": 0.6461865901947021,
"rewards/cosine_scaled_reward/mean": -0.06745242327451706,
"rewards/cosine_scaled_reward/std": 0.41534900665283203,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1974.4375,
"completions/mean_terminated_length": 1524.888916015625,
"completions/min_length": 1105.0,
"completions/min_terminated_length": 1105.0,
"epoch": 0.029714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23350541293621063,
"learning_rate": 9.982876141412855e-07,
"loss": 0.0101,
"num_tokens": 3231191.0,
"reward": 0.16762161254882812,
"reward_std": 0.5227605104446411,
"rewards/cosine_scaled_reward/mean": -0.041189197450876236,
"rewards/cosine_scaled_reward/std": 0.37332749366760254,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.0,
"completions/mean_length": 1915.5,
"completions/mean_terminated_length": 1277.0909423828125,
"completions/min_length": 554.0,
"completions/min_terminated_length": 554.0,
"epoch": 0.030857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21174418926239014,
"learning_rate": 9.975348529157229e-07,
"loss": 0.0312,
"num_tokens": 3364071.0,
"reward": -0.18293717503547668,
"reward_std": 0.5386844873428345,
"rewards/cosine_scaled_reward/mean": -0.20865610241889954,
"rewards/cosine_scaled_reward/std": 0.2562413811683655,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2007.0,
"completions/mean_length": 1815.140625,
"completions/mean_terminated_length": 1220.0555419921875,
"completions/min_length": 445.0,
"completions/min_terminated_length": 445.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.213092640042305,
"learning_rate": 9.96645768238595e-07,
"loss": 0.0361,
"num_tokens": 3490576.0,
"reward": 0.04266031086444855,
"reward_std": 0.776748776435852,
"rewards/cosine_scaled_reward/mean": -0.13491985201835632,
"rewards/cosine_scaled_reward/std": 0.37269750237464905,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 1906.15625,
"completions/mean_terminated_length": 1039.3333740234375,
"completions/min_length": 633.0,
"completions/min_terminated_length": 633.0,
"epoch": 0.03314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22322852909564972,
"learning_rate": 9.956206309337066e-07,
"loss": 0.0389,
"num_tokens": 3623042.0,
"reward": -0.1004815474152565,
"reward_std": 0.539789080619812,
"rewards/cosine_scaled_reward/mean": -0.12836576998233795,
"rewards/cosine_scaled_reward/std": 0.28681084513664246,
"rewards/format_reward/mean": 0.15625,
"rewards/format_reward/std": 0.36596253514289856,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 1854.1875,
"completions/mean_terminated_length": 1162.0,
"completions/min_length": 592.0,
"completions/min_terminated_length": 592.0,
"epoch": 0.03428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2063974291086197,
"learning_rate": 9.944597532678119e-07,
"loss": 0.0115,
"num_tokens": 3752246.0,
"reward": -0.030107807368040085,
"reward_std": 0.6322507858276367,
"rewards/cosine_scaled_reward/mean": -0.1634913980960846,
"rewards/cosine_scaled_reward/std": 0.31110286712646484,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1523.0,
"completions/mean_length": 1841.15625,
"completions/mean_terminated_length": 724.2000122070312,
"completions/min_length": 165.0,
"completions/min_terminated_length": 165.0,
"epoch": 0.03542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2025275081396103,
"learning_rate": 9.931634888554935e-07,
"loss": 0.0143,
"num_tokens": 3880576.0,
"reward": -0.34719598293304443,
"reward_std": 0.5259275436401367,
"rewards/cosine_scaled_reward/mean": -0.2595354914665222,
"rewards/cosine_scaled_reward/std": 0.24079306423664093,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 1945.65625,
"completions/mean_terminated_length": 1393.0,
"completions/min_length": 899.0,
"completions/min_terminated_length": 899.0,
"epoch": 0.036571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22421319782733917,
"learning_rate": 9.917322325514487e-07,
"loss": 0.0542,
"num_tokens": 4015450.0,
"reward": -0.2238868921995163,
"reward_std": 0.6127103567123413,
"rewards/cosine_scaled_reward/mean": -0.20569345355033875,
"rewards/cosine_scaled_reward/std": 0.26141345500946045,
"rewards/format_reward/mean": 0.1875,
"rewards/format_reward/std": 0.39339789748191833,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1959.0,
"completions/mean_length": 1976.890625,
"completions/mean_terminated_length": 1289.5,
"completions/min_length": 581.0,
"completions/min_terminated_length": 581.0,
"epoch": 0.037714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2219865769147873,
"learning_rate": 9.901664203302124e-07,
"loss": 0.0139,
"num_tokens": 4153187.0,
"reward": -0.5050230026245117,
"reward_std": 0.38754361867904663,
"rewards/cosine_scaled_reward/mean": -0.31501150131225586,
"rewards/cosine_scaled_reward/std": 0.19765734672546387,
"rewards/format_reward/mean": 0.125,
"rewards/format_reward/std": 0.3333333432674408,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1573.625,
"completions/mean_terminated_length": 1068.6451416015625,
"completions/min_length": 517.0,
"completions/min_terminated_length": 517.0,
"epoch": 0.038857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23140408098697662,
"learning_rate": 9.88466529153356e-07,
"loss": 0.0697,
"num_tokens": 4263451.0,
"reward": 0.3802332282066345,
"reward_std": 0.8625352382659912,
"rewards/cosine_scaled_reward/mean": -0.05207090824842453,
"rewards/cosine_scaled_reward/std": 0.4423771798610687,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.5037065148353577,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1957.0,
"completions/mean_length": 1814.875,
"completions/mean_terminated_length": 1053.3333740234375,
"completions/min_length": 359.0,
"completions/min_terminated_length": 359.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21747314929962158,
"learning_rate": 9.866330768241983e-07,
"loss": 0.0288,
"num_tokens": 4391099.0,
"reward": 0.11022068560123444,
"reward_std": 0.898347795009613,
"rewards/cosine_scaled_reward/mean": -0.08551465719938278,
"rewards/cosine_scaled_reward/std": 0.4119128882884979,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1775.0,
"completions/mean_length": 1976.765625,
"completions/mean_terminated_length": 1288.166748046875,
"completions/min_length": 964.0,
"completions/min_terminated_length": 964.0,
"epoch": 0.04114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23834578692913055,
"learning_rate": 9.846666218300807e-07,
"loss": 0.021,
"num_tokens": 4528724.0,
"reward": -0.38736510276794434,
"reward_std": 0.5356569290161133,
"rewards/cosine_scaled_reward/mean": -0.24837006628513336,
"rewards/cosine_scaled_reward/std": 0.23275430500507355,
"rewards/format_reward/mean": 0.109375,
"rewards/format_reward/std": 0.3145764470100403,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2047.0,
"completions/mean_length": 2013.9375,
"completions/mean_terminated_length": 1503.0,
"completions/min_length": 1027.0,
"completions/min_terminated_length": 1027.0,
"epoch": 0.04228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22654284536838531,
"learning_rate": 9.825677631722435e-07,
"loss": 0.0142,
"num_tokens": 4668640.0,
"reward": -0.42377781867980957,
"reward_std": 0.379480242729187,
"rewards/cosine_scaled_reward/mean": -0.2665764391422272,
"rewards/cosine_scaled_reward/std": 0.18001720309257507,
"rewards/format_reward/mean": 0.109375,
"rewards/format_reward/std": 0.3145764470100403,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1254.0,
"completions/mean_length": 1912.265625,
"completions/mean_terminated_length": 1082.77783203125,
"completions/min_length": 920.0,
"completions/min_terminated_length": 920.0,
"epoch": 0.04342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22495149075984955,
"learning_rate": 9.80337140183366e-07,
"loss": 0.0353,
"num_tokens": 4802737.0,
"reward": -0.15185467898845673,
"reward_std": 0.38927191495895386,
"rewards/cosine_scaled_reward/mean": -0.14623984694480896,
"rewards/cosine_scaled_reward/std": 0.32866883277893066,
"rewards/format_reward/mean": 0.140625,
"rewards/format_reward/std": 0.3503824472427368,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1979.0,
"completions/mean_length": 1678.4375,
"completions/mean_terminated_length": 656.7058715820312,
"completions/min_length": 300.0,
"completions/min_terminated_length": 300.0,
"epoch": 0.044571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19435559213161469,
"learning_rate": 9.779754323328192e-07,
"loss": 0.0416,
"num_tokens": 4920941.0,
"reward": 0.17510981857776642,
"reward_std": 0.559760570526123,
"rewards/cosine_scaled_reward/mean": -0.0765075832605362,
"rewards/cosine_scaled_reward/std": 0.3369429409503937,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1762.0,
"completions/mean_length": 1682.25,
"completions/mean_terminated_length": 877.6000366210938,
"completions/min_length": 465.0,
"completions/min_terminated_length": 465.0,
"epoch": 0.045714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19440439343452454,
"learning_rate": 9.754833590196926e-07,
"loss": 0.0685,
"num_tokens": 5038677.0,
"reward": 0.09382888674736023,
"reward_std": 0.4140171706676483,
"rewards/cosine_scaled_reward/mean": -0.12496057152748108,
"rewards/cosine_scaled_reward/std": 0.3649806082248688,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1905.0,
"completions/mean_length": 1841.0625,
"completions/mean_terminated_length": 1417.3333740234375,
"completions/min_length": 965.0,
"completions/min_terminated_length": 965.0,
"epoch": 0.046857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21685408055782318,
"learning_rate": 9.728616793536587e-07,
"loss": 0.0105,
"num_tokens": 5167657.0,
"reward": -0.15476089715957642,
"reward_std": 0.5854519605636597,
"rewards/cosine_scaled_reward/mean": -0.2648804187774658,
"rewards/cosine_scaled_reward/std": 0.26939424872398376,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1933.0,
"completions/mean_length": 1695.0,
"completions/mean_terminated_length": 719.058837890625,
"completions/min_length": 205.0,
"completions/min_terminated_length": 205.0,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23794473707675934,
"learning_rate": 9.701111919237408e-07,
"loss": 0.0327,
"num_tokens": 5286497.0,
"reward": -0.2923233211040497,
"reward_std": 0.36149862408638,
"rewards/cosine_scaled_reward/mean": -0.27897417545318604,
"rewards/cosine_scaled_reward/std": 0.17192503809928894,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1793.84375,
"completions/mean_terminated_length": 1031.375,
"completions/min_length": 714.0,
"completions/min_terminated_length": 714.0,
"epoch": 0.04914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21354877948760986,
"learning_rate": 9.672327345550543e-07,
"loss": 0.0597,
"num_tokens": 5412919.0,
"reward": -0.0004070103168487549,
"reward_std": 0.5297929048538208,
"rewards/cosine_scaled_reward/mean": -0.12520350515842438,
"rewards/cosine_scaled_reward/std": 0.3128352463245392,
"rewards/format_reward/mean": 0.25,
"rewards/format_reward/std": 0.4364357888698578,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1729.0,
"completions/mean_length": 1651.1875,
"completions/mean_terminated_length": 778.2000122070312,
"completions/min_length": 251.0,
"completions/min_terminated_length": 251.0,
"epoch": 0.05028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20100052654743195,
"learning_rate": 9.64227184053598e-07,
"loss": 0.0089,
"num_tokens": 5529291.0,
"reward": 0.13101597130298615,
"reward_std": 0.5976744890213013,
"rewards/cosine_scaled_reward/mean": -0.09855452179908752,
"rewards/cosine_scaled_reward/std": 0.46046286821365356,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1953.0,
"completions/mean_length": 2011.765625,
"completions/mean_terminated_length": 1584.2000732421875,
"completions/min_length": 1146.0,
"completions/min_terminated_length": 1146.0,
"epoch": 0.05142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2144525796175003,
"learning_rate": 9.610954559391704e-07,
"loss": 0.0108,
"num_tokens": 5669700.0,
"reward": -0.15992262959480286,
"reward_std": 0.5183610916137695,
"rewards/cosine_scaled_reward/mean": -0.14246131479740143,
"rewards/cosine_scaled_reward/std": 0.37169432640075684,
"rewards/format_reward/mean": 0.125,
"rewards/format_reward/std": 0.3333333432674408,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1566.0,
"completions/mean_length": 1811.21875,
"completions/mean_terminated_length": 965.5714721679688,
"completions/min_length": 578.0,
"completions/min_terminated_length": 578.0,
"epoch": 0.052571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2243409901857376,
"learning_rate": 9.578385041664925e-07,
"loss": 0.0324,
"num_tokens": 5796786.0,
"reward": -0.2682954668998718,
"reward_std": 0.47855472564697266,
"rewards/cosine_scaled_reward/mean": -0.2435227334499359,
"rewards/cosine_scaled_reward/std": 0.21708372235298157,
"rewards/format_reward/mean": 0.21875,
"rewards/format_reward/std": 0.4166666865348816,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1730.0,
"completions/mean_length": 1720.15625,
"completions/mean_terminated_length": 882.3333129882812,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.053714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1925242692232132,
"learning_rate": 9.54457320834625e-07,
"loss": 0.0641,
"num_tokens": 5917276.0,
"reward": -0.03124237060546875,
"reward_std": 0.6693180203437805,
"rewards/cosine_scaled_reward/mean": -0.17968368530273438,
"rewards/cosine_scaled_reward/std": 0.379862517118454,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1723.734375,
"completions/mean_terminated_length": 1059.761962890625,
"completions/min_length": 617.0,
"completions/min_terminated_length": 617.0,
"epoch": 0.054857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20502391457557678,
"learning_rate": 9.509529358847654e-07,
"loss": 0.0544,
"num_tokens": 6038139.0,
"reward": 0.21815460920333862,
"reward_std": 0.6701791286468506,
"rewards/cosine_scaled_reward/mean": -0.05498518794775009,
"rewards/cosine_scaled_reward/std": 0.42852458357810974,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1571.0,
"completions/mean_length": 1450.3125,
"completions/mean_terminated_length": 814.0645141601562,
"completions/min_length": 275.0,
"completions/min_terminated_length": 275.0,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18392837047576904,
"learning_rate": 9.473264167865171e-07,
"loss": 0.0771,
"num_tokens": 6141023.0,
"reward": 0.20156216621398926,
"reward_std": 0.7049944400787354,
"rewards/cosine_scaled_reward/mean": -0.14921891689300537,
"rewards/cosine_scaled_reward/std": 0.35212206840515137,
"rewards/format_reward/mean": 0.5,
"rewards/format_reward/std": 0.5039526224136353,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1605.0,
"completions/mean_length": 1740.84375,
"completions/mean_terminated_length": 643.857177734375,
"completions/min_length": 307.0,
"completions/min_terminated_length": 307.0,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1936425119638443,
"learning_rate": 9.43578868212728e-07,
"loss": 0.0292,
"num_tokens": 6263253.0,
"reward": -0.08827750384807587,
"reward_std": 0.3788633346557617,
"rewards/cosine_scaled_reward/mean": -0.16132624447345734,
"rewards/cosine_scaled_reward/std": 0.36572694778442383,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 50
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1543.0,
"completions/mean_length": 1443.46875,
"completions/mean_terminated_length": 758.3333740234375,
"completions/min_length": 398.0,
"completions/min_terminated_length": 398.0,
"epoch": 0.05828571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19979657232761383,
"learning_rate": 9.397114317029974e-07,
"loss": 0.0422,
"num_tokens": 6365843.0,
"reward": 0.09200635552406311,
"reward_std": 0.5713317394256592,
"rewards/cosine_scaled_reward/mean": -0.18837183713912964,
"rewards/cosine_scaled_reward/std": 0.28224873542785645,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 51
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1883.0,
"completions/mean_length": 1713.5,
"completions/mean_terminated_length": 788.7058715820312,
"completions/min_length": 282.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.05942857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2033102959394455,
"learning_rate": 9.357252853159505e-07,
"loss": 0.0054,
"num_tokens": 6486859.0,
"reward": -0.013391643762588501,
"reward_std": 0.41247767210006714,
"rewards/cosine_scaled_reward/mean": -0.14732082188129425,
"rewards/cosine_scaled_reward/std": 0.3900400698184967,
"rewards/format_reward/mean": 0.28125,
"rewards/format_reward/std": 0.4531635046005249,
"step": 52
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1233.0,
"completions/mean_length": 1724.828125,
"completions/mean_terminated_length": 669.1333618164062,
"completions/min_length": 400.0,
"completions/min_terminated_length": 400.0,
"epoch": 0.060571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20569661259651184,
"learning_rate": 9.316216432703916e-07,
"loss": 0.0366,
"num_tokens": 6608080.0,
"reward": -0.0075955986976623535,
"reward_std": 0.6965757012367249,
"rewards/cosine_scaled_reward/mean": -0.15223531424999237,
"rewards/cosine_scaled_reward/std": 0.32304710149765015,
"rewards/format_reward/mean": 0.296875,
"rewards/format_reward/std": 0.4604927599430084,
"step": 53
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 1761.859375,
"completions/mean_terminated_length": 1132.3499755859375,
"completions/min_length": 602.0,
"completions/min_terminated_length": 602.0,
"epoch": 0.061714285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19333088397979736,
"learning_rate": 9.274017555754407e-07,
"loss": 0.0673,
"num_tokens": 6731983.0,
"reward": 0.35599154233932495,
"reward_std": 1.0488793849945068,
"rewards/cosine_scaled_reward/mean": -0.05637925863265991,
"rewards/cosine_scaled_reward/std": 0.46367430686950684,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 54
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1820.0,
"completions/mean_length": 1711.0,
"completions/mean_terminated_length": 969.6000366210938,
"completions/min_length": 568.0,
"completions/min_terminated_length": 568.0,
"epoch": 0.06285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17474976181983948,
"learning_rate": 9.230669076497687e-07,
"loss": 0.0316,
"num_tokens": 6851775.0,
"reward": 0.20368073880672455,
"reward_std": 0.4746112525463104,
"rewards/cosine_scaled_reward/mean": -0.05440961569547653,
"rewards/cosine_scaled_reward/std": 0.4434376358985901,
"rewards/format_reward/mean": 0.3125,
"rewards/format_reward/std": 0.467176616191864,
"step": 55
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1844.0,
"completions/mean_length": 1733.625,
"completions/mean_terminated_length": 1042.0,
"completions/min_length": 534.0,
"completions/min_terminated_length": 534.0,
"epoch": 0.064,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19868570566177368,
"learning_rate": 9.186184199300463e-07,
"loss": 0.0503,
"num_tokens": 6973687.0,
"reward": 0.19238728284835815,
"reward_std": 0.5642611980438232,
"rewards/cosine_scaled_reward/mean": -0.10693138092756271,
"rewards/cosine_scaled_reward/std": 0.48442336916923523,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 56
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1998.0,
"completions/mean_length": 1957.140625,
"completions/mean_terminated_length": 1401.888916015625,
"completions/min_length": 919.0,
"completions/min_terminated_length": 919.0,
"epoch": 0.06514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1979535073041916,
"learning_rate": 9.140576474687263e-07,
"loss": 0.0417,
"num_tokens": 7110512.0,
"reward": -0.4044339656829834,
"reward_std": 0.4291505217552185,
"rewards/cosine_scaled_reward/mean": -0.3037794828414917,
"rewards/cosine_scaled_reward/std": 0.17916814982891083,
"rewards/format_reward/mean": 0.203125,
"rewards/format_reward/std": 0.40550529956817627,
"step": 57
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 1508.109375,
"completions/mean_terminated_length": 968.21875,
"completions/min_length": 367.0,
"completions/min_terminated_length": 367.0,
"epoch": 0.06628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17342056334018707,
"learning_rate": 9.093859795212817e-07,
"loss": 0.0248,
"num_tokens": 7217127.0,
"reward": 0.6014055013656616,
"reward_std": 0.8353673219680786,
"rewards/cosine_scaled_reward/mean": -0.019609764218330383,
"rewards/cosine_scaled_reward/std": 0.4545621871948242,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 58
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1806.0,
"completions/mean_length": 1733.078125,
"completions/mean_terminated_length": 1088.2381591796875,
"completions/min_length": 541.0,
"completions/min_terminated_length": 541.0,
"epoch": 0.06742857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18201254308223724,
"learning_rate": 9.046048391230247e-07,
"loss": 0.0086,
"num_tokens": 7338508.0,
"reward": 0.13016025722026825,
"reward_std": 0.5339452624320984,
"rewards/cosine_scaled_reward/mean": -0.11460737138986588,
"rewards/cosine_scaled_reward/std": 0.40606313943862915,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 59
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1347.0,
"completions/mean_length": 1773.171875,
"completions/mean_terminated_length": 948.6875,
"completions/min_length": 635.0,
"completions/min_terminated_length": 635.0,
"epoch": 0.06857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17932923138141632,
"learning_rate": 8.997156826556369e-07,
"loss": 0.0516,
"num_tokens": 7462879.0,
"reward": -0.2482871562242508,
"reward_std": 0.4085908830165863,
"rewards/cosine_scaled_reward/mean": -0.2569561004638672,
"rewards/cosine_scaled_reward/std": 0.2272651493549347,
"rewards/format_reward/mean": 0.265625,
"rewards/format_reward/std": 0.44515693187713623,
"step": 60
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1726.890625,
"completions/mean_terminated_length": 966.368408203125,
"completions/min_length": 470.0,
"completions/min_terminated_length": 470.0,
"epoch": 0.06971428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20350487530231476,
"learning_rate": 8.9471999940354e-07,
"loss": 0.0252,
"num_tokens": 7584920.0,
"reward": 0.2052871733903885,
"reward_std": 0.7419347763061523,
"rewards/cosine_scaled_reward/mean": -0.10829392075538635,
"rewards/cosine_scaled_reward/std": 0.31667017936706543,
"rewards/format_reward/mean": 0.421875,
"rewards/format_reward/std": 0.49776285886764526,
"step": 61
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1668.0,
"completions/mean_length": 1549.28125,
"completions/mean_terminated_length": 865.8518676757812,
"completions/min_length": 450.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.07085714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17975586652755737,
"learning_rate": 8.896193111002475e-07,
"loss": 0.049,
"num_tokens": 7694306.0,
"reward": 0.3729054629802704,
"reward_std": 0.6217197775840759,
"rewards/cosine_scaled_reward/mean": -0.055734772235155106,
"rewards/cosine_scaled_reward/std": 0.3869990408420563,
"rewards/format_reward/mean": 0.484375,
"rewards/format_reward/std": 0.5037065148353577,
"step": 62
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1891.0,
"completions/mean_length": 1506.75,
"completions/mean_terminated_length": 1136.4210205078125,
"completions/min_length": 479.0,
"completions/min_terminated_length": 479.0,
"epoch": 0.072,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19678133726119995,
"learning_rate": 8.844151714648274e-07,
"loss": 0.0991,
"num_tokens": 7800994.0,
"reward": 0.5956183671951294,
"reward_std": 0.7010378837585449,
"rewards/cosine_scaled_reward/mean": -0.0068783238530159,
"rewards/cosine_scaled_reward/std": 0.4637373983860016,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 63
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1813.0,
"completions/mean_length": 1494.484375,
"completions/mean_terminated_length": 974.5151977539062,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.07314285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19099701941013336,
"learning_rate": 8.791091657286267e-07,
"loss": 0.0732,
"num_tokens": 7907593.0,
"reward": 0.28888779878616333,
"reward_std": 0.6505820751190186,
"rewards/cosine_scaled_reward/mean": -0.13680610060691833,
"rewards/cosine_scaled_reward/std": 0.36594465374946594,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 64
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1689.0,
"completions/mean_length": 1618.453125,
"completions/mean_terminated_length": 673.4500122070312,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.07428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18873563408851624,
"learning_rate": 8.737029101523929e-07,
"loss": 0.0291,
"num_tokens": 8021878.0,
"reward": -0.08997441828250885,
"reward_std": 0.3741680085659027,
"rewards/cosine_scaled_reward/mean": -0.20904971659183502,
"rewards/cosine_scaled_reward/std": 0.35118550062179565,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 65
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1513.0,
"completions/mean_length": 1217.859375,
"completions/mean_terminated_length": 719.7750244140625,
"completions/min_length": 226.0,
"completions/min_terminated_length": 226.0,
"epoch": 0.07542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15865544974803925,
"learning_rate": 8.681980515339463e-07,
"loss": 0.0289,
"num_tokens": 8110053.0,
"reward": 0.5319543480873108,
"reward_std": 0.7594929337501526,
"rewards/cosine_scaled_reward/mean": -0.046522848308086395,
"rewards/cosine_scaled_reward/std": 0.47548800706863403,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 66
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1708.0,
"completions/mean_length": 1944.203125,
"completions/mean_terminated_length": 1309.888916015625,
"completions/min_length": 827.0,
"completions/min_terminated_length": 827.0,
"epoch": 0.07657142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1986117660999298,
"learning_rate": 8.625962667065487e-07,
"loss": 0.0261,
"num_tokens": 8245218.0,
"reward": -0.3390616178512573,
"reward_std": 0.38135582208633423,
"rewards/cosine_scaled_reward/mean": -0.25546830892562866,
"rewards/cosine_scaled_reward/std": 0.1776033639907837,
"rewards/format_reward/mean": 0.171875,
"rewards/format_reward/std": 0.38025420904159546,
"step": 67
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1960.0,
"completions/mean_length": 997.09375,
"completions/mean_terminated_length": 675.3877563476562,
"completions/min_length": 269.0,
"completions/min_terminated_length": 269.0,
"epoch": 0.07771428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13545449078083038,
"learning_rate": 8.568992620281243e-07,
"loss": 0.0228,
"num_tokens": 8318288.0,
"reward": 0.7294949293136597,
"reward_std": 0.5788470506668091,
"rewards/cosine_scaled_reward/mean": -0.018065020442008972,
"rewards/cosine_scaled_reward/std": 0.42799946665763855,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 68
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1640.0,
"completions/mean_length": 1297.90625,
"completions/mean_terminated_length": 817.076904296875,
"completions/min_length": 375.0,
"completions/min_terminated_length": 375.0,
"epoch": 0.07885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18707174062728882,
"learning_rate": 8.511087728614862e-07,
"loss": 0.0366,
"num_tokens": 8411698.0,
"reward": 0.20155681669712067,
"reward_std": 0.5115354061126709,
"rewards/cosine_scaled_reward/mean": -0.21172159910202026,
"rewards/cosine_scaled_reward/std": 0.30984631180763245,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 69
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1986.0,
"completions/mean_length": 1842.484375,
"completions/mean_terminated_length": 951.9166870117188,
"completions/min_length": 444.0,
"completions/min_terminated_length": 444.0,
"epoch": 0.08,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1955178827047348,
"learning_rate": 8.452265630457282e-07,
"loss": 0.0137,
"num_tokens": 8541073.0,
"reward": -0.1862781047821045,
"reward_std": 0.5197064876556396,
"rewards/cosine_scaled_reward/mean": -0.21032655239105225,
"rewards/cosine_scaled_reward/std": 0.26505687832832336,
"rewards/format_reward/mean": 0.234375,
"rewards/format_reward/std": 0.42695629596710205,
"step": 70
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1884.0,
"completions/mean_length": 1553.671875,
"completions/mean_terminated_length": 831.1923217773438,
"completions/min_length": 356.0,
"completions/min_terminated_length": 356.0,
"epoch": 0.08114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19525648653507233,
"learning_rate": 8.392544243589427e-07,
"loss": 0.0346,
"num_tokens": 8651228.0,
"reward": 0.3039510250091553,
"reward_std": 0.7005565762519836,
"rewards/cosine_scaled_reward/mean": -0.05114949122071266,
"rewards/cosine_scaled_reward/std": 0.47836223244667053,
"rewards/format_reward/mean": 0.40625,
"rewards/format_reward/std": 0.49501484632492065,
"step": 71
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1733.0,
"completions/mean_length": 1406.8125,
"completions/mean_terminated_length": 841.058837890625,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.08228571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17535589635372162,
"learning_rate": 8.331941759724268e-07,
"loss": 0.0434,
"num_tokens": 8751616.0,
"reward": 0.21918153762817383,
"reward_std": 0.4695218801498413,
"rewards/cosine_scaled_reward/mean": -0.1716592162847519,
"rewards/cosine_scaled_reward/std": 0.21545428037643433,
"rewards/format_reward/mean": 0.5625,
"rewards/format_reward/std": 0.5,
"step": 72
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.84375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1867.0,
"completions/mean_length": 1944.015625,
"completions/mean_terminated_length": 1382.5,
"completions/min_length": 951.0,
"completions/min_terminated_length": 951.0,
"epoch": 0.08342857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21378304064273834,
"learning_rate": 8.270476638965461e-07,
"loss": 0.0319,
"num_tokens": 8886761.0,
"reward": -0.37639105319976807,
"reward_std": 0.4715355932712555,
"rewards/cosine_scaled_reward/mean": -0.26632052659988403,
"rewards/cosine_scaled_reward/std": 0.23604609072208405,
"rewards/format_reward/mean": 0.15625,
"rewards/format_reward/std": 0.36596253514289856,
"step": 73
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1967.0,
"completions/mean_length": 1745.1875,
"completions/mean_terminated_length": 1125.142822265625,
"completions/min_length": 590.0,
"completions/min_terminated_length": 590.0,
"epoch": 0.08457142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17263740301132202,
"learning_rate": 8.208167604184217e-07,
"loss": 0.0465,
"num_tokens": 9008381.0,
"reward": 0.2182944416999817,
"reward_std": 0.5771346092224121,
"rewards/cosine_scaled_reward/mean": -0.06272779405117035,
"rewards/cosine_scaled_reward/std": 0.4549061059951782,
"rewards/format_reward/mean": 0.34375,
"rewards/format_reward/std": 0.4787135720252991,
"step": 74
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1717.484375,
"completions/mean_terminated_length": 1292.5357666015625,
"completions/min_length": 660.0,
"completions/min_terminated_length": 660.0,
"epoch": 0.08571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19337797164916992,
"learning_rate": 8.145033635316128e-07,
"loss": 0.0729,
"num_tokens": 9129260.0,
"reward": 0.32763606309890747,
"reward_std": 0.5694445371627808,
"rewards/cosine_scaled_reward/mean": -0.07055696845054626,
"rewards/cosine_scaled_reward/std": 0.48110467195510864,
"rewards/format_reward/mean": 0.46875,
"rewards/format_reward/std": 0.5029674172401428,
"step": 75
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1544.0,
"completions/mean_length": 1465.96875,
"completions/mean_terminated_length": 883.9375,
"completions/min_length": 391.0,
"completions/min_terminated_length": 391.0,
"epoch": 0.08685714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18641816079616547,
"learning_rate": 8.081093963579707e-07,
"loss": 0.018,
"num_tokens": 9233482.0,
"reward": -0.015750765800476074,
"reward_std": 0.4846976697444916,
"rewards/cosine_scaled_reward/mean": -0.26568788290023804,
"rewards/cosine_scaled_reward/std": 0.22177822887897491,
"rewards/format_reward/mean": 0.515625,
"rewards/format_reward/std": 0.5037065148353577,
"step": 76
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1673.03125,
"completions/mean_terminated_length": 1004.6087036132812,
"completions/min_length": 574.0,
"completions/min_terminated_length": 574.0,
"epoch": 0.088,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1897410899400711,
"learning_rate": 8.01636806561836e-07,
"loss": 0.0314,
"num_tokens": 9352132.0,
"reward": 0.2287338674068451,
"reward_std": 0.3902437686920166,
"rewards/cosine_scaled_reward/mean": -0.06532055139541626,
"rewards/cosine_scaled_reward/std": 0.3456854224205017,
"rewards/format_reward/mean": 0.359375,
"rewards/format_reward/std": 0.4836103618144989,
"step": 77
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.640625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1847.0,
"completions/mean_length": 1693.9375,
"completions/mean_terminated_length": 1062.7825927734375,
"completions/min_length": 541.0,
"completions/min_terminated_length": 541.0,
"epoch": 0.08914285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19339214265346527,
"learning_rate": 7.950875657567621e-07,
"loss": 0.0274,
"num_tokens": 9471264.0,
"reward": 0.21436695754528046,
"reward_std": 0.47337740659713745,
"rewards/cosine_scaled_reward/mean": -0.08031650632619858,
"rewards/cosine_scaled_reward/std": 0.45170801877975464,
"rewards/format_reward/mean": 0.375,
"rewards/format_reward/std": 0.48795005679130554,
"step": 78
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1369.875,
"completions/mean_terminated_length": 875.0270385742188,
"completions/min_length": 216.0,
"completions/min_terminated_length": 216.0,
"epoch": 0.09028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16123633086681366,
"learning_rate": 7.884636689049422e-07,
"loss": 0.028,
"num_tokens": 9569032.0,
"reward": 0.34234681725502014,
"reward_std": 0.5154464840888977,
"rewards/cosine_scaled_reward/mean": -0.12570157647132874,
"rewards/cosine_scaled_reward/std": 0.37586960196495056,
"rewards/format_reward/mean": 0.59375,
"rewards/format_reward/std": 0.49501484632492065,
"step": 79
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.59375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1933.0,
"completions/mean_length": 1728.5,
"completions/mean_terminated_length": 1261.5384521484375,
"completions/min_length": 521.0,
"completions/min_terminated_length": 521.0,
"epoch": 0.09142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2202686369419098,
"learning_rate": 7.817671337095244e-07,
"loss": 0.0315,
"num_tokens": 9690048.0,
"reward": 0.14108766615390778,
"reward_std": 0.6474246382713318,
"rewards/cosine_scaled_reward/mean": -0.1482061743736267,
"rewards/cosine_scaled_reward/std": 0.35231441259384155,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 80
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1512.5,
"completions/mean_terminated_length": 942.4515991210938,
"completions/min_length": 276.0,
"completions/min_terminated_length": 276.0,
"epoch": 0.09257142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2079077959060669,
"learning_rate": 7.75e-07,
"loss": 0.0289,
"num_tokens": 9798120.0,
"reward": 0.18197788298130035,
"reward_std": 0.6896297931671143,
"rewards/cosine_scaled_reward/mean": -0.17463606595993042,
"rewards/cosine_scaled_reward/std": 0.33339765667915344,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 81
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1964.0,
"completions/mean_length": 1331.59375,
"completions/mean_terminated_length": 981.720947265625,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.09371428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17440767586231232,
"learning_rate": 7.681643291108517e-07,
"loss": 0.0501,
"num_tokens": 9893670.0,
"reward": 0.6656259894371033,
"reward_std": 0.47437405586242676,
"rewards/cosine_scaled_reward/mean": -0.010936971753835678,
"rewards/cosine_scaled_reward/std": 0.51872718334198,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 82
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.421875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1630.0,
"completions/mean_length": 1375.609375,
"completions/mean_terminated_length": 884.9459838867188,
"completions/min_length": 515.0,
"completions/min_terminated_length": 515.0,
"epoch": 0.09485714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1706554889678955,
"learning_rate": 7.612622032536507e-07,
"loss": 0.0555,
"num_tokens": 9992797.0,
"reward": 0.5926185846328735,
"reward_std": 0.5033661127090454,
"rewards/cosine_scaled_reward/mean": -0.008378200232982635,
"rewards/cosine_scaled_reward/std": 0.43848997354507446,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 83
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.453125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2017.0,
"completions/mean_length": 1469.953125,
"completions/mean_terminated_length": 991.0,
"completions/min_length": 415.0,
"completions/min_terminated_length": 415.0,
"epoch": 0.096,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.162735253572464,
"learning_rate": 7.54295724882796e-07,
"loss": 0.0524,
"num_tokens": 10097570.0,
"reward": 0.305806964635849,
"reward_std": 0.7360225915908813,
"rewards/cosine_scaled_reward/mean": -0.1361590325832367,
"rewards/cosine_scaled_reward/std": 0.3629942834377289,
"rewards/format_reward/mean": 0.578125,
"rewards/format_reward/std": 0.49776285886764526,
"step": 84
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1938.0,
"completions/mean_length": 1538.546875,
"completions/mean_terminated_length": 1271.6905517578125,
"completions/min_length": 757.0,
"completions/min_terminated_length": 757.0,
"epoch": 0.09714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1640138328075409,
"learning_rate": 7.472670160550848e-07,
"loss": 0.0237,
"num_tokens": 10206357.0,
"reward": 0.5352065563201904,
"reward_std": 0.6669929027557373,
"rewards/cosine_scaled_reward/mean": -0.10739670693874359,
"rewards/cosine_scaled_reward/std": 0.3617566227912903,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 85
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1729.0,
"completions/mean_length": 1533.96875,
"completions/mean_terminated_length": 1051.0909423828125,
"completions/min_length": 487.0,
"completions/min_terminated_length": 487.0,
"epoch": 0.09828571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.20526152849197388,
"learning_rate": 7.401782177833147e-07,
"loss": 0.0296,
"num_tokens": 10315387.0,
"reward": 0.17176683247089386,
"reward_std": 0.5576786994934082,
"rewards/cosine_scaled_reward/mean": -0.17974159121513367,
"rewards/cosine_scaled_reward/std": 0.29362010955810547,
"rewards/format_reward/mean": 0.53125,
"rewards/format_reward/std": 0.5029674172401428,
"step": 86
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.265625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1923.0,
"completions/mean_length": 1203.109375,
"completions/mean_terminated_length": 897.5106201171875,
"completions/min_length": 294.0,
"completions/min_terminated_length": 294.0,
"epoch": 0.09942857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15400013327598572,
"learning_rate": 7.330314893841101e-07,
"loss": 0.0372,
"num_tokens": 10402506.0,
"reward": 0.40166229009628296,
"reward_std": 0.5953558683395386,
"rewards/cosine_scaled_reward/mean": -0.15854386985301971,
"rewards/cosine_scaled_reward/std": 0.33755984902381897,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 87
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1830.0,
"completions/mean_length": 1300.203125,
"completions/mean_terminated_length": 820.84619140625,
"completions/min_length": 143.0,
"completions/min_terminated_length": 143.0,
"epoch": 0.10057142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18996815383434296,
"learning_rate": 7.258290078201731e-07,
"loss": 0.0892,
"num_tokens": 10496231.0,
"reward": 0.8728382587432861,
"reward_std": 0.9383659958839417,
"rewards/cosine_scaled_reward/mean": 0.13173162937164307,
"rewards/cosine_scaled_reward/std": 0.4831489026546478,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 88
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1840.0,
"completions/mean_length": 1399.203125,
"completions/mean_terminated_length": 1145.3260498046875,
"completions/min_length": 404.0,
"completions/min_terminated_length": 404.0,
"epoch": 0.10171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16587954759597778,
"learning_rate": 7.185729670371604e-07,
"loss": 0.0474,
"num_tokens": 10596780.0,
"reward": 0.5412895679473877,
"reward_std": 0.7176238894462585,
"rewards/cosine_scaled_reward/mean": -0.12779270112514496,
"rewards/cosine_scaled_reward/std": 0.4147184491157532,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 89
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2042.0,
"completions/mean_length": 1320.28125,
"completions/mean_terminated_length": 989.5,
"completions/min_length": 146.0,
"completions/min_terminated_length": 146.0,
"epoch": 0.10285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18331165611743927,
"learning_rate": 7.11265577295385e-07,
"loss": 0.0604,
"num_tokens": 10691278.0,
"reward": 0.4162590801715851,
"reward_std": 0.5778031349182129,
"rewards/cosine_scaled_reward/mean": -0.15905795991420746,
"rewards/cosine_scaled_reward/std": 0.30715692043304443,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 90
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.53125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2028.0,
"completions/mean_length": 1667.765625,
"completions/mean_terminated_length": 1236.8333740234375,
"completions/min_length": 350.0,
"completions/min_terminated_length": 350.0,
"epoch": 0.104,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18394418060779572,
"learning_rate": 7.039090644965509e-07,
"loss": 0.0414,
"num_tokens": 10808583.0,
"reward": 0.3724231719970703,
"reward_std": 0.7342937588691711,
"rewards/cosine_scaled_reward/mean": -0.08722592890262604,
"rewards/cosine_scaled_reward/std": 0.40396004915237427,
"rewards/format_reward/mean": 0.546875,
"rewards/format_reward/std": 0.501733124256134,
"step": 91
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1795.0,
"completions/mean_length": 1328.3125,
"completions/mean_terminated_length": 924.5853271484375,
"completions/min_length": 315.0,
"completions/min_terminated_length": 315.0,
"epoch": 0.10514285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.155538409948349,
"learning_rate": 6.965056695057204e-07,
"loss": 0.0317,
"num_tokens": 10903587.0,
"reward": 0.27152636647224426,
"reward_std": 0.41478919982910156,
"rewards/cosine_scaled_reward/mean": -0.20017430186271667,
"rewards/cosine_scaled_reward/std": 0.21147257089614868,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 92
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1591.0,
"completions/mean_length": 1794.3125,
"completions/mean_terminated_length": 1193.4736328125,
"completions/min_length": 525.0,
"completions/min_terminated_length": 525.0,
"epoch": 0.10628571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22204875946044922,
"learning_rate": 6.890576474687263e-07,
"loss": 0.0395,
"num_tokens": 11029767.0,
"reward": -0.17630083858966827,
"reward_std": 0.37804993987083435,
"rewards/cosine_scaled_reward/mean": -0.2522129416465759,
"rewards/cosine_scaled_reward/std": 0.1775641143321991,
"rewards/format_reward/mean": 0.328125,
"rewards/format_reward/std": 0.4732423722743988,
"step": 93
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 1590.546875,
"completions/mean_terminated_length": 1002.3928833007812,
"completions/min_length": 456.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.10742857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2089747041463852,
"learning_rate": 6.815672671252315e-07,
"loss": 0.0437,
"num_tokens": 11141330.0,
"reward": 0.2654608488082886,
"reward_std": 0.46520984172821045,
"rewards/cosine_scaled_reward/mean": -0.08601956069469452,
"rewards/cosine_scaled_reward/std": 0.44666990637779236,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 94
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2034.0,
"completions/mean_length": 1713.703125,
"completions/mean_terminated_length": 1255.5926513671875,
"completions/min_length": 685.0,
"completions/min_terminated_length": 685.0,
"epoch": 0.10857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.18688982725143433,
"learning_rate": 6.740368101176495e-07,
"loss": 0.0541,
"num_tokens": 11261535.0,
"reward": -0.05255071818828583,
"reward_std": 0.5581967234611511,
"rewards/cosine_scaled_reward/mean": -0.2450253665447235,
"rewards/cosine_scaled_reward/std": 0.26258015632629395,
"rewards/format_reward/mean": 0.4375,
"rewards/format_reward/std": 0.5,
"step": 95
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 1330.15625,
"completions/mean_terminated_length": 1110.4080810546875,
"completions/min_length": 494.0,
"completions/min_terminated_length": 494.0,
"epoch": 0.10971428571428571,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16858014464378357,
"learning_rate": 6.664685702961344e-07,
"loss": 0.0578,
"num_tokens": 11357801.0,
"reward": 0.6634380221366882,
"reward_std": 0.605156421661377,
"rewards/cosine_scaled_reward/mean": -0.05890599265694618,
"rewards/cosine_scaled_reward/std": 0.43818399310112,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 96
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1821.0,
"completions/mean_length": 1436.109375,
"completions/mean_terminated_length": 1157.977294921875,
"completions/min_length": 506.0,
"completions/min_terminated_length": 506.0,
"epoch": 0.11085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16547614336013794,
"learning_rate": 6.588648530198504e-07,
"loss": 0.0574,
"num_tokens": 11459976.0,
"reward": 0.4811592698097229,
"reward_std": 0.7198842167854309,
"rewards/cosine_scaled_reward/mean": -0.15004536509513855,
"rewards/cosine_scaled_reward/std": 0.3795333206653595,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 97
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1975.0,
"completions/mean_length": 1367.375,
"completions/mean_terminated_length": 1034.9766845703125,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.112,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16006368398666382,
"learning_rate": 6.512279744547392e-07,
"loss": 0.0562,
"num_tokens": 11558552.0,
"reward": 0.6184609532356262,
"reward_std": 0.6620975136756897,
"rewards/cosine_scaled_reward/mean": -0.05014452338218689,
"rewards/cosine_scaled_reward/std": 0.4317678213119507,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 98
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1973.0,
"completions/mean_length": 1430.109375,
"completions/mean_terminated_length": 1007.3421020507812,
"completions/min_length": 257.0,
"completions/min_terminated_length": 257.0,
"epoch": 0.11314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1656832993030548,
"learning_rate": 6.435602608679916e-07,
"loss": 0.0158,
"num_tokens": 11661247.0,
"reward": 0.5446015000343323,
"reward_std": 0.771472692489624,
"rewards/cosine_scaled_reward/mean": -0.06363675743341446,
"rewards/cosine_scaled_reward/std": 0.46424856781959534,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 99
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 1241.515625,
"completions/mean_terminated_length": 1092.1666259765625,
"completions/min_length": 501.0,
"completions/min_terminated_length": 501.0,
"epoch": 0.11428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.156847283244133,
"learning_rate": 6.358640479194451e-07,
"loss": 0.0072,
"num_tokens": 11750416.0,
"reward": 0.8802791833877563,
"reward_std": 0.6878768801689148,
"rewards/cosine_scaled_reward/mean": 0.010452112182974815,
"rewards/cosine_scaled_reward/std": 0.44097819924354553,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 100
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1842.0,
"completions/mean_length": 1303.140625,
"completions/mean_terminated_length": 1054.854248046875,
"completions/min_length": 457.0,
"completions/min_terminated_length": 457.0,
"epoch": 0.11542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1940712332725525,
"learning_rate": 6.281416799501187e-07,
"loss": 0.0692,
"num_tokens": 11845249.0,
"reward": 0.6096305847167969,
"reward_std": 0.6024209260940552,
"rewards/cosine_scaled_reward/mean": -0.08580972999334335,
"rewards/cosine_scaled_reward/std": 0.4317456781864166,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 101
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1781.0,
"completions/mean_length": 1110.4375,
"completions/mean_terminated_length": 915.8490600585938,
"completions/min_length": 417.0,
"completions/min_terminated_length": 417.0,
"epoch": 0.11657142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15823149681091309,
"learning_rate": 6.203955092681039e-07,
"loss": -0.01,
"num_tokens": 11926469.0,
"reward": 0.6660584211349487,
"reward_std": 0.6825114488601685,
"rewards/cosine_scaled_reward/mean": -0.09665828198194504,
"rewards/cosine_scaled_reward/std": 0.403474897146225,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 102
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2039.0,
"completions/mean_length": 1261.796875,
"completions/mean_terminated_length": 849.9761962890625,
"completions/min_length": 156.0,
"completions/min_terminated_length": 156.0,
"epoch": 0.11771428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15815117955207825,
"learning_rate": 6.126278954320294e-07,
"loss": 0.034,
"num_tokens": 12017576.0,
"reward": 0.34080806374549866,
"reward_std": 0.6322569251060486,
"rewards/cosine_scaled_reward/mean": -0.19678348302841187,
"rewards/cosine_scaled_reward/std": 0.32747402787208557,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 103
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1834.0,
"completions/mean_length": 1391.296875,
"completions/mean_terminated_length": 941.9736938476562,
"completions/min_length": 376.0,
"completions/min_terminated_length": 376.0,
"epoch": 0.11885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19132234156131744,
"learning_rate": 6.048412045323164e-07,
"loss": 0.0082,
"num_tokens": 12117083.0,
"reward": 0.37928348779678345,
"reward_std": 0.49314552545547485,
"rewards/cosine_scaled_reward/mean": -0.1306707262992859,
"rewards/cosine_scaled_reward/std": 0.32499685883522034,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 104
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1541.0,
"completions/mean_length": 1213.046875,
"completions/mean_terminated_length": 886.3261108398438,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.12,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14559786021709442,
"learning_rate": 5.97037808470444e-07,
"loss": 0.0717,
"num_tokens": 12206006.0,
"reward": 0.682517409324646,
"reward_std": 0.7797682285308838,
"rewards/cosine_scaled_reward/mean": -0.04155381768941879,
"rewards/cosine_scaled_reward/std": 0.46402302384376526,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 105
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1967.0,
"completions/mean_length": 1300.3125,
"completions/mean_terminated_length": 880.8779907226562,
"completions/min_length": 266.0,
"completions/min_terminated_length": 266.0,
"epoch": 0.12114285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1580500453710556,
"learning_rate": 5.892200842364462e-07,
"loss": 0.0158,
"num_tokens": 12300058.0,
"reward": 0.8088675737380981,
"reward_std": 0.6575020551681519,
"rewards/cosine_scaled_reward/mean": 0.06849630177021027,
"rewards/cosine_scaled_reward/std": 0.44056057929992676,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 106
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1818.0,
"completions/mean_length": 1324.09375,
"completions/mean_terminated_length": 917.9999389648438,
"completions/min_length": 500.0,
"completions/min_terminated_length": 500.0,
"epoch": 0.12228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1742219179868698,
"learning_rate": 5.813904131848564e-07,
"loss": 0.0609,
"num_tokens": 12395952.0,
"reward": 0.3355618119239807,
"reward_std": 0.6852389574050903,
"rewards/cosine_scaled_reward/mean": -0.19159409403800964,
"rewards/cosine_scaled_reward/std": 0.36729469895362854,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 107
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2046.0,
"completions/mean_length": 1475.21875,
"completions/mean_terminated_length": 1083.3157958984375,
"completions/min_length": 441.0,
"completions/min_terminated_length": 441.0,
"epoch": 0.12342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17198115587234497,
"learning_rate": 5.735511803093248e-07,
"loss": 0.0333,
"num_tokens": 12500726.0,
"reward": 0.4056100845336914,
"reward_std": 0.8831891417503357,
"rewards/cosine_scaled_reward/mean": -0.11750747263431549,
"rewards/cosine_scaled_reward/std": 0.43770650029182434,
"rewards/format_reward/mean": 0.640625,
"rewards/format_reward/std": 0.4836103618144989,
"step": 108
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1918.0,
"completions/mean_length": 1491.984375,
"completions/mean_terminated_length": 969.6666870117188,
"completions/min_length": 413.0,
"completions/min_terminated_length": 413.0,
"epoch": 0.12457142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17869649827480316,
"learning_rate": 5.657047735161255e-07,
"loss": 0.0269,
"num_tokens": 12607461.0,
"reward": 0.43367689847946167,
"reward_std": 0.4224759340286255,
"rewards/cosine_scaled_reward/mean": -0.08003655076026917,
"rewards/cosine_scaled_reward/std": 0.4198642373085022,
"rewards/format_reward/mean": 0.59375,
"rewards/format_reward/std": 0.49501484632492065,
"step": 109
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1827.0,
"completions/mean_length": 1304.03125,
"completions/mean_terminated_length": 886.6829223632812,
"completions/min_length": 297.0,
"completions/min_terminated_length": 297.0,
"epoch": 0.12571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16700071096420288,
"learning_rate": 5.578535828967777e-07,
"loss": 0.0425,
"num_tokens": 12701695.0,
"reward": 0.28147247433662415,
"reward_std": 0.5220406651496887,
"rewards/cosine_scaled_reward/mean": -0.21082626283168793,
"rewards/cosine_scaled_reward/std": 0.26231563091278076,
"rewards/format_reward/mean": 0.703125,
"rewards/format_reward/std": 0.4604927599430084,
"step": 110
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2015.0,
"completions/mean_length": 1238.453125,
"completions/mean_terminated_length": 1051.6346435546875,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.12685714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1514654904603958,
"learning_rate": 5.5e-07,
"loss": 0.0223,
"num_tokens": 12791076.0,
"reward": 0.6152657270431519,
"reward_std": 0.724465012550354,
"rewards/cosine_scaled_reward/mean": -0.12986713647842407,
"rewards/cosine_scaled_reward/std": 0.3984290361404419,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 111
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.515625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1637.421875,
"completions/mean_terminated_length": 1200.3548583984375,
"completions/min_length": 493.0,
"completions/min_terminated_length": 493.0,
"epoch": 0.128,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.19140374660491943,
"learning_rate": 5.421464171032224e-07,
"loss": 0.0523,
"num_tokens": 12906967.0,
"reward": 0.3433418571949005,
"reward_std": 0.7630938291549683,
"rewards/cosine_scaled_reward/mean": -0.08614157140254974,
"rewards/cosine_scaled_reward/std": 0.39247801899909973,
"rewards/format_reward/mean": 0.515625,
"rewards/format_reward/std": 0.5037065148353577,
"step": 112
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1730.0,
"completions/mean_length": 1076.734375,
"completions/mean_terminated_length": 976.2586059570312,
"completions/min_length": 441.0,
"completions/min_terminated_length": 441.0,
"epoch": 0.12914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1517946422100067,
"learning_rate": 5.342952264838747e-07,
"loss": 0.0395,
"num_tokens": 12986174.0,
"reward": 0.8363064527511597,
"reward_std": 0.7532539367675781,
"rewards/cosine_scaled_reward/mean": -0.034971803426742554,
"rewards/cosine_scaled_reward/std": 0.4582027494907379,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 113
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 1117.0,
"completions/mean_terminated_length": 923.7736206054688,
"completions/min_length": 274.0,
"completions/min_terminated_length": 274.0,
"epoch": 0.13028571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13561034202575684,
"learning_rate": 5.264488196906752e-07,
"loss": -0.0089,
"num_tokens": 13067510.0,
"reward": 0.5447627305984497,
"reward_std": 0.5072149038314819,
"rewards/cosine_scaled_reward/mean": -0.18855616450309753,
"rewards/cosine_scaled_reward/std": 0.30408668518066406,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 114
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1386.25,
"completions/mean_terminated_length": 1039.6190185546875,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.13142857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15800845623016357,
"learning_rate": 5.186095868151436e-07,
"loss": 0.0471,
"num_tokens": 13167446.0,
"reward": 0.3318045139312744,
"reward_std": 0.42589348554611206,
"rewards/cosine_scaled_reward/mean": -0.201285257935524,
"rewards/cosine_scaled_reward/std": 0.3381516635417938,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 115
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.46875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2022.0,
"completions/mean_length": 1590.546875,
"completions/mean_terminated_length": 1186.9117431640625,
"completions/min_length": 360.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.13257142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1938212811946869,
"learning_rate": 5.107799157635538e-07,
"loss": 0.0594,
"num_tokens": 13280625.0,
"reward": 0.33110541105270386,
"reward_std": 0.8637042045593262,
"rewards/cosine_scaled_reward/mean": -0.14694729447364807,
"rewards/cosine_scaled_reward/std": 0.4159691035747528,
"rewards/format_reward/mean": 0.625,
"rewards/format_reward/std": 0.48795005679130554,
"step": 116
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2014.0,
"completions/mean_length": 1270.0625,
"completions/mean_terminated_length": 1052.239990234375,
"completions/min_length": 345.0,
"completions/min_terminated_length": 345.0,
"epoch": 0.1337142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1720697581768036,
"learning_rate": 5.02962191529556e-07,
"loss": 0.0298,
"num_tokens": 13372933.0,
"reward": 0.4778579771518707,
"reward_std": 0.7142170667648315,
"rewards/cosine_scaled_reward/mean": -0.15950849652290344,
"rewards/cosine_scaled_reward/std": 0.3388991951942444,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 117
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1366.125,
"completions/mean_terminated_length": 1078.2222900390625,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.13485714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15761680901050568,
"learning_rate": 4.951587954676837e-07,
"loss": -0.0003,
"num_tokens": 13470901.0,
"reward": 0.8694435954093933,
"reward_std": 0.7288352847099304,
"rewards/cosine_scaled_reward/mean": 0.05972181260585785,
"rewards/cosine_scaled_reward/std": 0.5152958035469055,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 118
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2048.0,
"completions/mean_length": 1087.609375,
"completions/mean_terminated_length": 888.2830200195312,
"completions/min_length": 298.0,
"completions/min_terminated_length": 298.0,
"epoch": 0.136,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1559121459722519,
"learning_rate": 4.873721045679706e-07,
"loss": -0.0042,
"num_tokens": 13550916.0,
"reward": 0.7770103216171265,
"reward_std": 0.6413853764533997,
"rewards/cosine_scaled_reward/mean": -0.04118231683969498,
"rewards/cosine_scaled_reward/std": 0.4513413906097412,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 119
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1954.0,
"completions/mean_length": 892.9375,
"completions/mean_terminated_length": 855.6773681640625,
"completions/min_length": 221.0,
"completions/min_terminated_length": 221.0,
"epoch": 0.13714285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15053324401378632,
"learning_rate": 4.79604490731896e-07,
"loss": 0.0091,
"num_tokens": 13618520.0,
"reward": 0.8417136073112488,
"reward_std": 0.5830849409103394,
"rewards/cosine_scaled_reward/mean": -0.06351819634437561,
"rewards/cosine_scaled_reward/std": 0.4826962351799011,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 120
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1640.0,
"completions/mean_length": 737.3125,
"completions/mean_terminated_length": 716.5079956054688,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.1382857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12602098286151886,
"learning_rate": 4.7185832004988133e-07,
"loss": -0.012,
"num_tokens": 13675404.0,
"reward": 1.0489320755004883,
"reward_std": 0.571333646774292,
"rewards/cosine_scaled_reward/mean": 0.02446599304676056,
"rewards/cosine_scaled_reward/std": 0.4796800911426544,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 121
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1387.875,
"completions/mean_terminated_length": 1109.1556396484375,
"completions/min_length": 378.0,
"completions/min_terminated_length": 378.0,
"epoch": 0.13942857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1755150556564331,
"learning_rate": 4.641359520805548e-07,
"loss": 0.0142,
"num_tokens": 13775788.0,
"reward": 0.535797119140625,
"reward_std": 0.7094154357910156,
"rewards/cosine_scaled_reward/mean": -0.1071014553308487,
"rewards/cosine_scaled_reward/std": 0.40171730518341064,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 122
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1906.0,
"completions/mean_length": 1303.28125,
"completions/mean_terminated_length": 1011.8695678710938,
"completions/min_length": 419.0,
"completions/min_terminated_length": 419.0,
"epoch": 0.14057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1657014787197113,
"learning_rate": 4.5643973913200837e-07,
"loss": 0.0521,
"num_tokens": 13869430.0,
"reward": 0.4624041020870209,
"reward_std": 0.4812185764312744,
"rewards/cosine_scaled_reward/mean": -0.15942296385765076,
"rewards/cosine_scaled_reward/std": 0.38984978199005127,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 123
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2021.0,
"completions/mean_length": 1265.578125,
"completions/mean_terminated_length": 1046.5,
"completions/min_length": 414.0,
"completions/min_terminated_length": 414.0,
"epoch": 0.1417142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13778002560138702,
"learning_rate": 4.4877202554526084e-07,
"loss": 0.0275,
"num_tokens": 13961379.0,
"reward": 0.772553563117981,
"reward_std": 0.6200233101844788,
"rewards/cosine_scaled_reward/mean": -0.004348240792751312,
"rewards/cosine_scaled_reward/std": 0.4568707346916199,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 124
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1370.4375,
"completions/mean_terminated_length": 1084.3555908203125,
"completions/min_length": 395.0,
"completions/min_terminated_length": 395.0,
"epoch": 0.14285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15328042209148407,
"learning_rate": 4.4113514698014953e-07,
"loss": -0.0036,
"num_tokens": 14060015.0,
"reward": 0.751072883605957,
"reward_std": 0.6632312536239624,
"rewards/cosine_scaled_reward/mean": 0.0005364194512367249,
"rewards/cosine_scaled_reward/std": 0.4951366186141968,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 125
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.296875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2011.0,
"completions/mean_length": 1375.4375,
"completions/mean_terminated_length": 1091.4666748046875,
"completions/min_length": 541.0,
"completions/min_terminated_length": 541.0,
"epoch": 0.144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17353153228759766,
"learning_rate": 4.3353142970386557e-07,
"loss": 0.0196,
"num_tokens": 14159339.0,
"reward": 0.5659087300300598,
"reward_std": 0.7280570268630981,
"rewards/cosine_scaled_reward/mean": -0.0764206275343895,
"rewards/cosine_scaled_reward/std": 0.3900720179080963,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 126
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1968.0,
"completions/mean_length": 1305.984375,
"completions/mean_terminated_length": 1078.836669921875,
"completions/min_length": 646.0,
"completions/min_terminated_length": 646.0,
"epoch": 0.14514285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15398724377155304,
"learning_rate": 4.2596318988235037e-07,
"loss": 0.1028,
"num_tokens": 14254018.0,
"reward": 0.46388188004493713,
"reward_std": 0.730462372303009,
"rewards/cosine_scaled_reward/mean": -0.15868405997753143,
"rewards/cosine_scaled_reward/std": 0.3666206896305084,
"rewards/format_reward/mean": 0.78125,
"rewards/format_reward/std": 0.4166666865348816,
"step": 127
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1916.0,
"completions/mean_length": 1140.96875,
"completions/mean_terminated_length": 1029.5789794921875,
"completions/min_length": 503.0,
"completions/min_terminated_length": 503.0,
"epoch": 0.1462857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1500931680202484,
"learning_rate": 4.1843273287476854e-07,
"loss": 0.0234,
"num_tokens": 14337392.0,
"reward": 0.9014174342155457,
"reward_std": 0.7117189764976501,
"rewards/cosine_scaled_reward/mean": -0.018041294068098068,
"rewards/cosine_scaled_reward/std": 0.44308605790138245,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 128
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.34375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1829.0,
"completions/mean_length": 1510.8125,
"completions/mean_terminated_length": 1229.4285888671875,
"completions/min_length": 456.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.14742857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17119824886322021,
"learning_rate": 4.1094235253127374e-07,
"loss": 0.0387,
"num_tokens": 14444788.0,
"reward": 0.49041426181793213,
"reward_std": 0.7674543857574463,
"rewards/cosine_scaled_reward/mean": -0.08291786164045334,
"rewards/cosine_scaled_reward/std": 0.4110357463359833,
"rewards/format_reward/mean": 0.65625,
"rewards/format_reward/std": 0.4787135720252991,
"step": 129
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 1295.78125,
"completions/mean_terminated_length": 953.8636474609375,
"completions/min_length": 355.0,
"completions/min_terminated_length": 355.0,
"epoch": 0.14857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1408931016921997,
"learning_rate": 4.034943304942796e-07,
"loss": 0.0237,
"num_tokens": 14538222.0,
"reward": 0.3384738564491272,
"reward_std": 0.5595801472663879,
"rewards/cosine_scaled_reward/mean": -0.1979505866765976,
"rewards/cosine_scaled_reward/std": 0.307023286819458,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 130
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 1266.53125,
"completions/mean_terminated_length": 1047.719970703125,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"epoch": 0.14971428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15982288122177124,
"learning_rate": 3.9609093550344907e-07,
"loss": 0.051,
"num_tokens": 14629016.0,
"reward": 0.8806734085083008,
"reward_std": 0.7300256490707397,
"rewards/cosine_scaled_reward/mean": 0.0340866819024086,
"rewards/cosine_scaled_reward/std": 0.44312214851379395,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 131
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.390625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1772.0,
"completions/mean_length": 1407.046875,
"completions/mean_terminated_length": 996.1795043945312,
"completions/min_length": 64.0,
"completions/min_terminated_length": 64.0,
"epoch": 0.15085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1669837236404419,
"learning_rate": 3.8873442270461485e-07,
"loss": 0.0031,
"num_tokens": 14730131.0,
"reward": 0.5413260459899902,
"reward_std": 0.7315264940261841,
"rewards/cosine_scaled_reward/mean": -0.034024473279714584,
"rewards/cosine_scaled_reward/std": 0.49355971813201904,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 132
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1997.0,
"completions/mean_length": 1341.828125,
"completions/mean_terminated_length": 1020.8409423828125,
"completions/min_length": 454.0,
"completions/min_terminated_length": 454.0,
"epoch": 0.152,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1627449095249176,
"learning_rate": 3.8142703296283953e-07,
"loss": 0.057,
"num_tokens": 14827232.0,
"reward": 0.48581433296203613,
"reward_std": 0.6289799213409424,
"rewards/cosine_scaled_reward/mean": -0.13990533351898193,
"rewards/cosine_scaled_reward/std": 0.3319030702114105,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 133
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2023.0,
"completions/mean_length": 1087.90625,
"completions/mean_terminated_length": 1006.5423583984375,
"completions/min_length": 450.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.15314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13660290837287903,
"learning_rate": 3.7417099217982686e-07,
"loss": 0.0397,
"num_tokens": 14907426.0,
"reward": 1.1596651077270508,
"reward_std": 0.5051962733268738,
"rewards/cosine_scaled_reward/mean": 0.1032700166106224,
"rewards/cosine_scaled_reward/std": 0.5394149422645569,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 134
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1921.0,
"completions/mean_length": 809.203125,
"completions/mean_terminated_length": 789.5397338867188,
"completions/min_length": 304.0,
"completions/min_terminated_length": 304.0,
"epoch": 0.15428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14757691323757172,
"learning_rate": 3.6696851061588994e-07,
"loss": -0.0311,
"num_tokens": 14969687.0,
"reward": 1.3557740449905396,
"reward_std": 0.6381043791770935,
"rewards/cosine_scaled_reward/mean": 0.18569952249526978,
"rewards/cosine_scaled_reward/std": 0.48723727464675903,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 135
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1900.0,
"completions/mean_length": 1199.28125,
"completions/mean_terminated_length": 1095.0526123046875,
"completions/min_length": 402.0,
"completions/min_terminated_length": 402.0,
"epoch": 0.15542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15660341084003448,
"learning_rate": 3.5982178221668533e-07,
"loss": 0.0159,
"num_tokens": 15057113.0,
"reward": 0.8486931920051575,
"reward_std": 0.7802823781967163,
"rewards/cosine_scaled_reward/mean": -0.05221588909626007,
"rewards/cosine_scaled_reward/std": 0.46035289764404297,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 136
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2012.0,
"completions/mean_length": 1251.625,
"completions/mean_terminated_length": 1169.2413330078125,
"completions/min_length": 499.0,
"completions/min_terminated_length": 499.0,
"epoch": 0.15657142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13971418142318726,
"learning_rate": 3.5273298394491515e-07,
"loss": 0.021,
"num_tokens": 15147825.0,
"reward": 0.73081374168396,
"reward_std": 0.6755009293556213,
"rewards/cosine_scaled_reward/mean": -0.11115560680627823,
"rewards/cosine_scaled_reward/std": 0.3759666979312897,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 137
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 1242.625,
"completions/mean_terminated_length": 974.1666870117188,
"completions/min_length": 366.0,
"completions/min_terminated_length": 366.0,
"epoch": 0.15771428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1405755877494812,
"learning_rate": 3.45704275117204e-07,
"loss": 0.0063,
"num_tokens": 15238665.0,
"reward": 0.6383824348449707,
"reward_std": 0.5606896281242371,
"rewards/cosine_scaled_reward/mean": -0.10268379747867584,
"rewards/cosine_scaled_reward/std": 0.4464382231235504,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 138
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1168.5625,
"completions/mean_terminated_length": 1125.3114013671875,
"completions/min_length": 329.0,
"completions/min_terminated_length": 329.0,
"epoch": 0.15885714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1392800360918045,
"learning_rate": 3.387377967463493e-07,
"loss": 0.0174,
"num_tokens": 15324093.0,
"reward": 0.819107174873352,
"reward_std": 0.6458143591880798,
"rewards/cosine_scaled_reward/mean": -0.07482142746448517,
"rewards/cosine_scaled_reward/std": 0.447048157453537,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 139
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 1163.484375,
"completions/mean_terminated_length": 1037.125,
"completions/min_length": 429.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.16,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.16070181131362915,
"learning_rate": 3.3183567088914833e-07,
"loss": 0.0484,
"num_tokens": 15408780.0,
"reward": 0.8035323619842529,
"reward_std": 0.5583758354187012,
"rewards/cosine_scaled_reward/mean": -0.04354630410671234,
"rewards/cosine_scaled_reward/std": 0.5074254870414734,
"rewards/format_reward/mean": 0.890625,
"rewards/format_reward/std": 0.3145764470100403,
"step": 140
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1854.0,
"completions/mean_length": 1113.328125,
"completions/mean_terminated_length": 1067.360595703125,
"completions/min_length": 456.0,
"completions/min_terminated_length": 456.0,
"epoch": 0.16114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13795652985572815,
"learning_rate": 3.250000000000001e-07,
"loss": 0.0117,
"num_tokens": 15490593.0,
"reward": 0.875638484954834,
"reward_std": 0.5237586498260498,
"rewards/cosine_scaled_reward/mean": -0.04655580222606659,
"rewards/cosine_scaled_reward/std": 0.49675485491752625,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 141
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2040.0,
"completions/mean_length": 1533.078125,
"completions/mean_terminated_length": 1388.9000244140625,
"completions/min_length": 376.0,
"completions/min_terminated_length": 376.0,
"epoch": 0.16228571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17600150406360626,
"learning_rate": 3.182328662904756e-07,
"loss": 0.0284,
"num_tokens": 15599214.0,
"reward": 0.7818896770477295,
"reward_std": 0.8826224207878113,
"rewards/cosine_scaled_reward/mean": -0.04655518755316734,
"rewards/cosine_scaled_reward/std": 0.48367196321487427,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 142
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1773.0,
"completions/mean_length": 1053.65625,
"completions/mean_terminated_length": 1004.7540283203125,
"completions/min_length": 425.0,
"completions/min_terminated_length": 425.0,
"epoch": 0.16342857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14887015521526337,
"learning_rate": 3.115363310950578e-07,
"loss": -0.0172,
"num_tokens": 15677464.0,
"reward": 0.7807654738426208,
"reward_std": 0.6237885355949402,
"rewards/cosine_scaled_reward/mean": -0.10180474817752838,
"rewards/cosine_scaled_reward/std": 0.3677360713481903,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 143
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1881.0,
"completions/mean_length": 1199.390625,
"completions/mean_terminated_length": 1003.5577392578125,
"completions/min_length": 312.0,
"completions/min_terminated_length": 312.0,
"epoch": 0.16457142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1500861942768097,
"learning_rate": 3.0491243424323783e-07,
"loss": 0.0237,
"num_tokens": 15765713.0,
"reward": 1.0909240245819092,
"reward_std": 0.6815290451049805,
"rewards/cosine_scaled_reward/mean": 0.12358702719211578,
"rewards/cosine_scaled_reward/std": 0.5295576453208923,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 144
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1994.0,
"completions/mean_length": 999.78125,
"completions/mean_terminated_length": 910.9491577148438,
"completions/min_length": 351.0,
"completions/min_terminated_length": 351.0,
"epoch": 0.1657142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13538403809070587,
"learning_rate": 2.9836319343816397e-07,
"loss": 0.0454,
"num_tokens": 15839643.0,
"reward": 0.8412516713142395,
"reward_std": 0.5241255760192871,
"rewards/cosine_scaled_reward/mean": -0.05593665689229965,
"rewards/cosine_scaled_reward/std": 0.412396639585495,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 145
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 1116.875,
"completions/mean_terminated_length": 1071.0819091796875,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.16685714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13304923474788666,
"learning_rate": 2.918906036420294e-07,
"loss": 0.0238,
"num_tokens": 15921739.0,
"reward": 0.6987366676330566,
"reward_std": 0.6291457414627075,
"rewards/cosine_scaled_reward/mean": -0.12719416618347168,
"rewards/cosine_scaled_reward/std": 0.4025166630744934,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 146
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2037.0,
"completions/mean_length": 1376.234375,
"completions/mean_terminated_length": 1188.1400146484375,
"completions/min_length": 470.0,
"completions/min_terminated_length": 470.0,
"epoch": 0.168,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17383964359760284,
"learning_rate": 2.854966364683872e-07,
"loss": 0.0816,
"num_tokens": 16020602.0,
"reward": 0.6075379848480225,
"reward_std": 0.6172347068786621,
"rewards/cosine_scaled_reward/mean": -0.12591850757598877,
"rewards/cosine_scaled_reward/std": 0.35805603861808777,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 147
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1895.0,
"completions/mean_length": 1046.640625,
"completions/mean_terminated_length": 961.7796630859375,
"completions/min_length": 286.0,
"completions/min_terminated_length": 286.0,
"epoch": 0.16914285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1272667795419693,
"learning_rate": 2.791832395815782e-07,
"loss": 0.0166,
"num_tokens": 16098771.0,
"reward": 0.831100583076477,
"reward_std": 0.46476393938064575,
"rewards/cosine_scaled_reward/mean": -0.06101220101118088,
"rewards/cosine_scaled_reward/std": 0.3975098729133606,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 148
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.484375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1946.0,
"completions/mean_length": 1548.46875,
"completions/mean_terminated_length": 1079.212158203125,
"completions/min_length": 636.0,
"completions/min_terminated_length": 636.0,
"epoch": 0.1702857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17459169030189514,
"learning_rate": 2.729523361034538e-07,
"loss": 0.0461,
"num_tokens": 16209769.0,
"reward": 0.2916935384273529,
"reward_std": 0.7498115301132202,
"rewards/cosine_scaled_reward/mean": -0.15884071588516235,
"rewards/cosine_scaled_reward/std": 0.3723042905330658,
"rewards/format_reward/mean": 0.609375,
"rewards/format_reward/std": 0.4917473793029785,
"step": 149
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2035.0,
"completions/mean_length": 1178.09375,
"completions/mean_terminated_length": 1088.1033935546875,
"completions/min_length": 331.0,
"completions/min_terminated_length": 331.0,
"epoch": 0.17142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13624173402786255,
"learning_rate": 2.6680582402757324e-07,
"loss": -0.0165,
"num_tokens": 16295671.0,
"reward": 0.8602047562599182,
"reward_std": 0.708452582359314,
"rewards/cosine_scaled_reward/mean": -0.06208515167236328,
"rewards/cosine_scaled_reward/std": 0.43239399790763855,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 150
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1794.0,
"completions/mean_length": 1131.15625,
"completions/mean_terminated_length": 961.370361328125,
"completions/min_length": 401.0,
"completions/min_terminated_length": 401.0,
"epoch": 0.17257142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15374499559402466,
"learning_rate": 2.6074557564105724e-07,
"loss": 0.0201,
"num_tokens": 16378745.0,
"reward": 1.0417982339859009,
"reward_std": 0.7430429458618164,
"rewards/cosine_scaled_reward/mean": 0.09121159464120865,
"rewards/cosine_scaled_reward/std": 0.4806567430496216,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 151
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1911.0,
"completions/mean_length": 1266.53125,
"completions/mean_terminated_length": 1006.0416870117188,
"completions/min_length": 170.0,
"completions/min_terminated_length": 170.0,
"epoch": 0.1737142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17976705729961395,
"learning_rate": 2.547734369542718e-07,
"loss": 0.059,
"num_tokens": 16470467.0,
"reward": 0.6073766946792603,
"reward_std": 0.7206203937530518,
"rewards/cosine_scaled_reward/mean": -0.07912418246269226,
"rewards/cosine_scaled_reward/std": 0.39534807205200195,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 152
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 1211.15625,
"completions/mean_terminated_length": 1091.607177734375,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.17485714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14580874145030975,
"learning_rate": 2.488912271385139e-07,
"loss": 0.0205,
"num_tokens": 16559125.0,
"reward": 0.5799474716186523,
"reward_std": 0.5585569143295288,
"rewards/cosine_scaled_reward/mean": -0.17096377909183502,
"rewards/cosine_scaled_reward/std": 0.288126140832901,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 153
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1349.578125,
"completions/mean_terminated_length": 1154.02001953125,
"completions/min_length": 572.0,
"completions/min_terminated_length": 572.0,
"epoch": 0.176,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16763651371002197,
"learning_rate": 2.4310073797187573e-07,
"loss": 0.0217,
"num_tokens": 16656562.0,
"reward": 0.4374058246612549,
"reward_std": 0.5803461670875549,
"rewards/cosine_scaled_reward/mean": -0.19535958766937256,
"rewards/cosine_scaled_reward/std": 0.36079293489456177,
"rewards/format_reward/mean": 0.828125,
"rewards/format_reward/std": 0.38025420904159546,
"step": 154
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1642.0,
"completions/mean_length": 1151.65625,
"completions/mean_terminated_length": 800.9130859375,
"completions/min_length": 309.0,
"completions/min_terminated_length": 309.0,
"epoch": 0.17714285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14943136274814606,
"learning_rate": 2.374037332934512e-07,
"loss": 0.0004,
"num_tokens": 16740892.0,
"reward": 0.7691766023635864,
"reward_std": 0.6439853310585022,
"rewards/cosine_scaled_reward/mean": -0.021661702543497086,
"rewards/cosine_scaled_reward/std": 0.463664174079895,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 155
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1956.0,
"completions/mean_length": 1292.609375,
"completions/mean_terminated_length": 997.021728515625,
"completions/min_length": 440.0,
"completions/min_terminated_length": 440.0,
"epoch": 0.1782857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.138059601187706,
"learning_rate": 2.3180194846605364e-07,
"loss": 0.0125,
"num_tokens": 16833723.0,
"reward": 0.7149533033370972,
"reward_std": 0.7902576923370361,
"rewards/cosine_scaled_reward/mean": -0.0018983632326126099,
"rewards/cosine_scaled_reward/std": 0.49385347962379456,
"rewards/format_reward/mean": 0.71875,
"rewards/format_reward/std": 0.4531635046005249,
"step": 156
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2043.0,
"completions/mean_length": 1275.4375,
"completions/mean_terminated_length": 1038.938720703125,
"completions/min_length": 367.0,
"completions/min_terminated_length": 367.0,
"epoch": 0.17942857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16267189383506775,
"learning_rate": 2.2629708984760706e-07,
"loss": 0.0604,
"num_tokens": 16925311.0,
"reward": 0.48121872544288635,
"reward_std": 0.7515869140625,
"rewards/cosine_scaled_reward/mean": -0.15782812237739563,
"rewards/cosine_scaled_reward/std": 0.34000325202941895,
"rewards/format_reward/mean": 0.796875,
"rewards/format_reward/std": 0.40550529956817627,
"step": 157
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 1001.390625,
"completions/mean_terminated_length": 912.6949462890625,
"completions/min_length": 218.0,
"completions/min_terminated_length": 218.0,
"epoch": 0.18057142857142858,
"frac_reward_zero_std": 0.125,
"grad_norm": 0.13367342948913574,
"learning_rate": 2.2089083427137329e-07,
"loss": 0.0333,
"num_tokens": 16999520.0,
"reward": 0.9302408695220947,
"reward_std": 0.558702826499939,
"rewards/cosine_scaled_reward/mean": -0.011442087590694427,
"rewards/cosine_scaled_reward/std": 0.4976855218410492,
"rewards/format_reward/mean": 0.953125,
"rewards/format_reward/std": 0.21304203569889069,
"step": 158
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2019.0,
"completions/mean_length": 1350.71875,
"completions/mean_terminated_length": 1155.47998046875,
"completions/min_length": 396.0,
"completions/min_terminated_length": 396.0,
"epoch": 0.18171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1456945538520813,
"learning_rate": 2.1558482853517253e-07,
"loss": 0.025,
"num_tokens": 17097646.0,
"reward": 0.5350826978683472,
"reward_std": 0.5987731218338013,
"rewards/cosine_scaled_reward/mean": -0.20120865106582642,
"rewards/cosine_scaled_reward/std": 0.3128848373889923,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 159
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1889.0,
"completions/mean_length": 1075.953125,
"completions/mean_terminated_length": 1028.1474609375,
"completions/min_length": 439.0,
"completions/min_terminated_length": 439.0,
"epoch": 0.18285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14507974684238434,
"learning_rate": 2.1038068889975259e-07,
"loss": 0.0368,
"num_tokens": 17178091.0,
"reward": 1.2330971956253052,
"reward_std": 0.7280604243278503,
"rewards/cosine_scaled_reward/mean": 0.124361053109169,
"rewards/cosine_scaled_reward/std": 0.47822633385658264,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 160
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.0,
"completions/mean_length": 1179.53125,
"completions/mean_terminated_length": 979.1154174804688,
"completions/min_length": 335.0,
"completions/min_terminated_length": 335.0,
"epoch": 0.184,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15625452995300293,
"learning_rate": 2.0528000059645995e-07,
"loss": 0.0581,
"num_tokens": 17263573.0,
"reward": 0.6913028955459595,
"reward_std": 0.7251037359237671,
"rewards/cosine_scaled_reward/mean": -0.07622354477643967,
"rewards/cosine_scaled_reward/std": 0.4124097228050232,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 161
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 1241.859375,
"completions/mean_terminated_length": 1016.1399536132812,
"completions/min_length": 345.0,
"completions/min_terminated_length": 345.0,
"epoch": 0.18514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1465344876050949,
"learning_rate": 2.0028431734436308e-07,
"loss": 0.0541,
"num_tokens": 17353356.0,
"reward": 0.5661511421203613,
"reward_std": 0.651351809501648,
"rewards/cosine_scaled_reward/mean": -0.14661191403865814,
"rewards/cosine_scaled_reward/std": 0.33989307284355164,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 162
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.28125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1764.0,
"completions/mean_length": 1244.15625,
"completions/mean_terminated_length": 929.6087036132812,
"completions/min_length": 390.0,
"completions/min_terminated_length": 390.0,
"epoch": 0.18628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16913798451423645,
"learning_rate": 1.9539516087697517e-07,
"loss": 0.028,
"num_tokens": 17444166.0,
"reward": 1.0750610828399658,
"reward_std": 0.7454421520233154,
"rewards/cosine_scaled_reward/mean": 0.17034301161766052,
"rewards/cosine_scaled_reward/std": 0.47072502970695496,
"rewards/format_reward/mean": 0.734375,
"rewards/format_reward/std": 0.44515693187713623,
"step": 163
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1824.0,
"completions/mean_length": 1044.09375,
"completions/mean_terminated_length": 940.2413940429688,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.18742857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15672317147254944,
"learning_rate": 1.9061402047871833e-07,
"loss": 0.0608,
"num_tokens": 17521516.0,
"reward": 0.928094744682312,
"reward_std": 0.6140168309211731,
"rewards/cosine_scaled_reward/mean": 0.003109898418188095,
"rewards/cosine_scaled_reward/std": 0.44902321696281433,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 164
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2004.0,
"completions/mean_length": 1069.75,
"completions/mean_terminated_length": 1004.5333862304688,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.18857142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14553649723529816,
"learning_rate": 1.8594235253127372e-07,
"loss": 0.0582,
"num_tokens": 17601396.0,
"reward": 1.1307251453399658,
"reward_std": 0.5828652381896973,
"rewards/cosine_scaled_reward/mean": 0.08098758012056351,
"rewards/cosine_scaled_reward/std": 0.5071448087692261,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 165
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1631.0,
"completions/mean_length": 1283.75,
"completions/mean_terminated_length": 1107.3846435546875,
"completions/min_length": 553.0,
"completions/min_terminated_length": 553.0,
"epoch": 0.18971428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15118341147899628,
"learning_rate": 1.8138158006995363e-07,
"loss": -0.0021,
"num_tokens": 17695132.0,
"reward": 0.9083728790283203,
"reward_std": 0.6904245615005493,
"rewards/cosine_scaled_reward/mean": 0.016686435788869858,
"rewards/cosine_scaled_reward/std": 0.4635255038738251,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 166
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.03125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1992.0,
"completions/mean_length": 1001.71875,
"completions/mean_terminated_length": 967.9677124023438,
"completions/min_length": 464.0,
"completions/min_terminated_length": 464.0,
"epoch": 0.19085714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12545917928218842,
"learning_rate": 1.7693309235023127e-07,
"loss": 0.0616,
"num_tokens": 17770786.0,
"reward": 0.7054103016853333,
"reward_std": 0.5469927787780762,
"rewards/cosine_scaled_reward/mean": -0.13948234915733337,
"rewards/cosine_scaled_reward/std": 0.3140275478363037,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 167
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2000.0,
"completions/mean_length": 1226.34375,
"completions/mean_terminated_length": 1171.5667724609375,
"completions/min_length": 474.0,
"completions/min_terminated_length": 474.0,
"epoch": 0.192,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16034062206745148,
"learning_rate": 1.7259824442455923e-07,
"loss": -0.0032,
"num_tokens": 17860112.0,
"reward": 1.0590779781341553,
"reward_std": 0.6419005393981934,
"rewards/cosine_scaled_reward/mean": 0.037351518869400024,
"rewards/cosine_scaled_reward/std": 0.433667927980423,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 168
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1886.0,
"completions/mean_length": 1041.6875,
"completions/mean_terminated_length": 992.1966552734375,
"completions/min_length": 412.0,
"completions/min_terminated_length": 412.0,
"epoch": 0.19314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13765358924865723,
"learning_rate": 1.6837835672960831e-07,
"loss": -0.0146,
"num_tokens": 17937404.0,
"reward": 1.4680163860321045,
"reward_std": 0.5853168964385986,
"rewards/cosine_scaled_reward/mean": 0.23400816321372986,
"rewards/cosine_scaled_reward/std": 0.5452130436897278,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 169
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1982.0,
"completions/mean_length": 1089.796875,
"completions/mean_terminated_length": 890.924560546875,
"completions/min_length": 318.0,
"completions/min_terminated_length": 318.0,
"epoch": 0.19428571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13031044602394104,
"learning_rate": 1.6427471468404952e-07,
"loss": 0.0316,
"num_tokens": 18016935.0,
"reward": 0.9512024521827698,
"reward_std": 0.4455436170101166,
"rewards/cosine_scaled_reward/mean": 0.03810122609138489,
"rewards/cosine_scaled_reward/std": 0.466457724571228,
"rewards/format_reward/mean": 0.875,
"rewards/format_reward/std": 0.3333333432674408,
"step": 170
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.25,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1960.0,
"completions/mean_length": 1223.53125,
"completions/mean_terminated_length": 948.7083740234375,
"completions/min_length": 432.0,
"completions/min_terminated_length": 432.0,
"epoch": 0.19542857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1566823422908783,
"learning_rate": 1.6028856829700258e-07,
"loss": -0.0008,
"num_tokens": 18105633.0,
"reward": 0.699189305305481,
"reward_std": 0.6079459190368652,
"rewards/cosine_scaled_reward/mean": -0.02540534734725952,
"rewards/cosine_scaled_reward/std": 0.4247443377971649,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 171
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.1875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 1152.765625,
"completions/mean_terminated_length": 946.173095703125,
"completions/min_length": 164.0,
"completions/min_terminated_length": 164.0,
"epoch": 0.19657142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17446376383304596,
"learning_rate": 1.5642113178727193e-07,
"loss": 0.0925,
"num_tokens": 18190026.0,
"reward": 1.1308399438858032,
"reward_std": 0.7069583535194397,
"rewards/cosine_scaled_reward/mean": 0.13573244214057922,
"rewards/cosine_scaled_reward/std": 0.4778185784816742,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 172
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2029.0,
"completions/mean_length": 1048.71875,
"completions/mean_terminated_length": 905.96435546875,
"completions/min_length": 206.0,
"completions/min_terminated_length": 206.0,
"epoch": 0.1977142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13813698291778564,
"learning_rate": 1.5267358321348285e-07,
"loss": 0.0382,
"num_tokens": 18268456.0,
"reward": 0.727963387966156,
"reward_std": 0.5711731910705566,
"rewards/cosine_scaled_reward/mean": -0.0891432836651802,
"rewards/cosine_scaled_reward/std": 0.4088326096534729,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 173
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1869.0,
"completions/mean_length": 1010.03125,
"completions/mean_terminated_length": 958.9835815429688,
"completions/min_length": 423.0,
"completions/min_terminated_length": 423.0,
"epoch": 0.19885714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13912717998027802,
"learning_rate": 1.4904706411523448e-07,
"loss": 0.0267,
"num_tokens": 18343434.0,
"reward": 0.8880202770233154,
"reward_std": 0.7008156180381775,
"rewards/cosine_scaled_reward/mean": -0.040364816784858704,
"rewards/cosine_scaled_reward/std": 0.46977153420448303,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 174
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1985.0,
"completions/mean_length": 1065.96875,
"completions/mean_terminated_length": 1017.6720581054688,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.2,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14181731641292572,
"learning_rate": 1.4554267916537495e-07,
"loss": 0.0015,
"num_tokens": 18422480.0,
"reward": 0.8645844459533691,
"reward_std": 0.5794019103050232,
"rewards/cosine_scaled_reward/mean": -0.03645776957273483,
"rewards/cosine_scaled_reward/std": 0.4564404785633087,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 175
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2009.0,
"completions/mean_length": 1199.859375,
"completions/mean_terminated_length": 962.3800048828125,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"epoch": 0.20114285714285715,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14228306710720062,
"learning_rate": 1.4216149583350755e-07,
"loss": 0.0401,
"num_tokens": 18510439.0,
"reward": 0.5378038883209229,
"reward_std": 0.6905298829078674,
"rewards/cosine_scaled_reward/mean": -0.13734807074069977,
"rewards/cosine_scaled_reward/std": 0.3545166850090027,
"rewards/format_reward/mean": 0.8125,
"rewards/format_reward/std": 0.39339789748191833,
"step": 176
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1136.078125,
"completions/mean_terminated_length": 967.2037353515625,
"completions/min_length": 341.0,
"completions/min_terminated_length": 341.0,
"epoch": 0.2022857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15204809606075287,
"learning_rate": 1.3890454406082956e-07,
"loss": 0.0195,
"num_tokens": 18593372.0,
"reward": 0.8706955313682556,
"reward_std": 0.561739444732666,
"rewards/cosine_scaled_reward/mean": 0.013472765684127808,
"rewards/cosine_scaled_reward/std": 0.5127261877059937,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 177
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.09375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 1090.78125,
"completions/mean_terminated_length": 991.7586059570312,
"completions/min_length": 410.0,
"completions/min_terminated_length": 410.0,
"epoch": 0.20342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14953841269016266,
"learning_rate": 1.3577281594640182e-07,
"loss": 0.0377,
"num_tokens": 18674718.0,
"reward": 1.023108720779419,
"reward_std": 0.6721357107162476,
"rewards/cosine_scaled_reward/mean": 0.042804330587387085,
"rewards/cosine_scaled_reward/std": 0.4879773259162903,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 178
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.328125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1739.0,
"completions/mean_length": 1336.46875,
"completions/mean_terminated_length": 988.9767456054688,
"completions/min_length": 450.0,
"completions/min_terminated_length": 450.0,
"epoch": 0.20457142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15529467165470123,
"learning_rate": 1.3276726544494571e-07,
"loss": 0.0596,
"num_tokens": 18771084.0,
"reward": 0.5006722211837769,
"reward_std": 0.8324818015098572,
"rewards/cosine_scaled_reward/mean": -0.12466391175985336,
"rewards/cosine_scaled_reward/std": 0.3854917585849762,
"rewards/format_reward/mean": 0.75,
"rewards/format_reward/std": 0.4364357888698578,
"step": 179
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2003.0,
"completions/mean_length": 1045.78125,
"completions/mean_terminated_length": 978.9667358398438,
"completions/min_length": 293.0,
"completions/min_terminated_length": 293.0,
"epoch": 0.2057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16482438147068024,
"learning_rate": 1.2988880807625927e-07,
"loss": 0.0338,
"num_tokens": 18849190.0,
"reward": 1.5156805515289307,
"reward_std": 0.7658263444900513,
"rewards/cosine_scaled_reward/mean": 0.27346524596214294,
"rewards/cosine_scaled_reward/std": 0.4864564538002014,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 180
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.234375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1929.0,
"completions/mean_length": 1285.703125,
"completions/mean_terminated_length": 1052.346923828125,
"completions/min_length": 473.0,
"completions/min_terminated_length": 473.0,
"epoch": 0.20685714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15365898609161377,
"learning_rate": 1.2713832064634125e-07,
"loss": 0.0144,
"num_tokens": 18942947.0,
"reward": 0.8511902093887329,
"reward_std": 0.5781452059745789,
"rewards/cosine_scaled_reward/mean": -0.004092369228601456,
"rewards/cosine_scaled_reward/std": 0.5130707621574402,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 181
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.3125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1312.578125,
"completions/mean_terminated_length": 978.2954711914062,
"completions/min_length": 436.0,
"completions/min_terminated_length": 436.0,
"epoch": 0.208,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14789460599422455,
"learning_rate": 1.2451664098030743e-07,
"loss": -0.0197,
"num_tokens": 19036744.0,
"reward": 0.43990767002105713,
"reward_std": 0.5781359076499939,
"rewards/cosine_scaled_reward/mean": -0.16285866498947144,
"rewards/cosine_scaled_reward/std": 0.3335002362728119,
"rewards/format_reward/mean": 0.765625,
"rewards/format_reward/std": 0.42695629596710205,
"step": 182
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1984.0,
"completions/mean_length": 961.703125,
"completions/mean_terminated_length": 944.4603881835938,
"completions/min_length": 407.0,
"completions/min_terminated_length": 407.0,
"epoch": 0.20914285714285713,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13262300193309784,
"learning_rate": 1.220245676671809e-07,
"loss": -0.0021,
"num_tokens": 19108909.0,
"reward": 1.2695767879486084,
"reward_std": 0.678351640701294,
"rewards/cosine_scaled_reward/mean": 0.1504133641719818,
"rewards/cosine_scaled_reward/std": 0.47719594836235046,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 183
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.046875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1879.0,
"completions/mean_length": 1119.015625,
"completions/mean_terminated_length": 1073.3277587890625,
"completions/min_length": 196.0,
"completions/min_terminated_length": 196.0,
"epoch": 0.2102857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14593224227428436,
"learning_rate": 1.1966285981663407e-07,
"loss": 0.0004,
"num_tokens": 19191878.0,
"reward": 0.6385983228683472,
"reward_std": 0.41251492500305176,
"rewards/cosine_scaled_reward/mean": -0.18070080876350403,
"rewards/cosine_scaled_reward/std": 0.3674013912677765,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 184
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.0625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1069.359375,
"completions/mean_terminated_length": 1004.11669921875,
"completions/min_length": 317.0,
"completions/min_terminated_length": 317.0,
"epoch": 0.21142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1318162977695465,
"learning_rate": 1.1743223682775649e-07,
"loss": 0.0032,
"num_tokens": 19270501.0,
"reward": 0.8208166360855103,
"reward_std": 0.5546972155570984,
"rewards/cosine_scaled_reward/mean": -0.08177915960550308,
"rewards/cosine_scaled_reward/std": 0.4313104748725891,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 185
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1826.0,
"completions/mean_length": 1249.34375,
"completions/mean_terminated_length": 1101.4444580078125,
"completions/min_length": 642.0,
"completions/min_terminated_length": 642.0,
"epoch": 0.21257142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15292362868785858,
"learning_rate": 1.1533337816991931e-07,
"loss": -0.0015,
"num_tokens": 19361155.0,
"reward": 0.9478594660758972,
"reward_std": 0.6171192526817322,
"rewards/cosine_scaled_reward/mean": 0.0520547591149807,
"rewards/cosine_scaled_reward/std": 0.46879369020462036,
"rewards/format_reward/mean": 0.84375,
"rewards/format_reward/std": 0.36596253514289856,
"step": 186
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.109375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1902.0,
"completions/mean_length": 1198.328125,
"completions/mean_terminated_length": 1093.982421875,
"completions/min_length": 439.0,
"completions/min_terminated_length": 439.0,
"epoch": 0.21371428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.1557895690202713,
"learning_rate": 1.1336692317580158e-07,
"loss": 0.0526,
"num_tokens": 19448272.0,
"reward": 0.8623722791671753,
"reward_std": 0.5490715503692627,
"rewards/cosine_scaled_reward/mean": -0.05318887531757355,
"rewards/cosine_scaled_reward/std": 0.4439024031162262,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 187
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2045.0,
"completions/mean_length": 1200.9375,
"completions/mean_terminated_length": 1062.3272705078125,
"completions/min_length": 385.0,
"completions/min_terminated_length": 385.0,
"epoch": 0.21485714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17476554214954376,
"learning_rate": 1.1153347084664419e-07,
"loss": 0.0307,
"num_tokens": 19536932.0,
"reward": 0.4879753887653351,
"reward_std": 0.5241237282752991,
"rewards/cosine_scaled_reward/mean": -0.21694980561733246,
"rewards/cosine_scaled_reward/std": 0.24976789951324463,
"rewards/format_reward/mean": 0.921875,
"rewards/format_reward/std": 0.27048972249031067,
"step": 188
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.078125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1620.0,
"completions/mean_length": 850.9375,
"completions/mean_terminated_length": 749.4915161132812,
"completions/min_length": 211.0,
"completions/min_terminated_length": 211.0,
"epoch": 0.216,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13221082091331482,
"learning_rate": 1.0983357966978745e-07,
"loss": 0.0039,
"num_tokens": 19600680.0,
"reward": 0.7197285890579224,
"reward_std": 0.6500153541564941,
"rewards/cosine_scaled_reward/mean": -0.12451067566871643,
"rewards/cosine_scaled_reward/std": 0.41736096143722534,
"rewards/format_reward/mean": 0.96875,
"rewards/format_reward/std": 0.17536810040473938,
"step": 189
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1999.0,
"completions/mean_length": 1033.84375,
"completions/mean_terminated_length": 1017.74609375,
"completions/min_length": 434.0,
"completions/min_terminated_length": 434.0,
"epoch": 0.21714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.12019925564527512,
"learning_rate": 1.0826776744855121e-07,
"loss": -0.0065,
"num_tokens": 19676614.0,
"reward": 1.0075644254684448,
"reward_std": 0.5787118673324585,
"rewards/cosine_scaled_reward/mean": 0.003782205283641815,
"rewards/cosine_scaled_reward/std": 0.4871532618999481,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 190
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1841.0,
"completions/mean_length": 932.953125,
"completions/mean_terminated_length": 915.2540283203125,
"completions/min_length": 370.0,
"completions/min_terminated_length": 370.0,
"epoch": 0.21828571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13237522542476654,
"learning_rate": 1.068365111445064e-07,
"loss": -0.0122,
"num_tokens": 19746611.0,
"reward": 1.093684196472168,
"reward_std": 0.551490068435669,
"rewards/cosine_scaled_reward/mean": 0.054654598236083984,
"rewards/cosine_scaled_reward/std": 0.45679154992103577,
"rewards/format_reward/mean": 0.984375,
"rewards/format_reward/std": 0.125,
"step": 191
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.21875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1955.0,
"completions/mean_length": 1365.53125,
"completions/mean_terminated_length": 1174.43994140625,
"completions/min_length": 488.0,
"completions/min_terminated_length": 488.0,
"epoch": 0.21942857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.15830805897712708,
"learning_rate": 1.0554024673218806e-07,
"loss": 0.0555,
"num_tokens": 19845301.0,
"reward": 0.5910857915878296,
"reward_std": 0.5397718548774719,
"rewards/cosine_scaled_reward/mean": -0.1341446042060852,
"rewards/cosine_scaled_reward/std": 0.3709484338760376,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 192
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1498.09375,
"completions/mean_terminated_length": 1189.6097412109375,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.22057142857142858,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.192939892411232,
"learning_rate": 1.0437936906629334e-07,
"loss": 0.0325,
"num_tokens": 19952323.0,
"reward": 0.4451579749584198,
"reward_std": 0.6873714923858643,
"rewards/cosine_scaled_reward/mean": -0.1133585125207901,
"rewards/cosine_scaled_reward/std": 0.39295101165771484,
"rewards/format_reward/mean": 0.671875,
"rewards/format_reward/std": 0.4732423722743988,
"step": 193
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.359375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1884.0,
"completions/mean_length": 1450.703125,
"completions/mean_terminated_length": 1115.6341552734375,
"completions/min_length": 496.0,
"completions/min_terminated_length": 496.0,
"epoch": 0.22171428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.16667421162128448,
"learning_rate": 1.0335423176140511e-07,
"loss": 0.0186,
"num_tokens": 20056528.0,
"reward": 0.7569496035575867,
"reward_std": 0.7739458084106445,
"rewards/cosine_scaled_reward/mean": 0.03472479432821274,
"rewards/cosine_scaled_reward/std": 0.4840702414512634,
"rewards/format_reward/mean": 0.6875,
"rewards/format_reward/std": 0.467176616191864,
"step": 194
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.171875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2038.0,
"completions/mean_length": 1248.28125,
"completions/mean_terminated_length": 1082.3018798828125,
"completions/min_length": 302.0,
"completions/min_terminated_length": 302.0,
"epoch": 0.22285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14721666276454926,
"learning_rate": 1.0246514708427701e-07,
"loss": 0.0652,
"num_tokens": 20147562.0,
"reward": 0.750861406326294,
"reward_std": 0.7012320756912231,
"rewards/cosine_scaled_reward/mean": -0.07769429683685303,
"rewards/cosine_scaled_reward/std": 0.406501442193985,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 195
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1995.0,
"completions/mean_length": 1270.9375,
"completions/mean_terminated_length": 1127.0369873046875,
"completions/min_length": 485.0,
"completions/min_terminated_length": 485.0,
"epoch": 0.224,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.17459681630134583,
"learning_rate": 1.017123858587145e-07,
"loss": 0.0453,
"num_tokens": 20240494.0,
"reward": 0.7895511388778687,
"reward_std": 0.8231704235076904,
"rewards/cosine_scaled_reward/mean": -0.03491191938519478,
"rewards/cosine_scaled_reward/std": 0.452151894569397,
"rewards/format_reward/mean": 0.859375,
"rewards/format_reward/std": 0.3503824472427368,
"step": 196
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.015625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1922.0,
"completions/mean_length": 960.125,
"completions/mean_terminated_length": 942.857177734375,
"completions/min_length": 332.0,
"completions/min_terminated_length": 332.0,
"epoch": 0.22514285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13420052826404572,
"learning_rate": 1.0109617738307911e-07,
"loss": 0.0164,
"num_tokens": 20312310.0,
"reward": 1.2753715515136719,
"reward_std": 0.8778545260429382,
"rewards/cosine_scaled_reward/mean": 0.13768577575683594,
"rewards/cosine_scaled_reward/std": 0.5257315039634705,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 197
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 1202.5,
"completions/mean_terminated_length": 1045.9259033203125,
"completions/min_length": 360.0,
"completions/min_terminated_length": 360.0,
"epoch": 0.22628571428571428,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14797629415988922,
"learning_rate": 1.0061670936044178e-07,
"loss": 0.0582,
"num_tokens": 20400774.0,
"reward": 0.7811780571937561,
"reward_std": 0.6028587818145752,
"rewards/cosine_scaled_reward/mean": -0.06253597885370255,
"rewards/cosine_scaled_reward/std": 0.4776788353919983,
"rewards/format_reward/mean": 0.90625,
"rewards/format_reward/std": 0.29378482699394226,
"step": 198
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.140625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2010.0,
"completions/mean_length": 1291.5,
"completions/mean_terminated_length": 1167.7091064453125,
"completions/min_length": 558.0,
"completions/min_terminated_length": 558.0,
"epoch": 0.22742857142857142,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.13744986057281494,
"learning_rate": 1.002741278414069e-07,
"loss": -0.0106,
"num_tokens": 20494934.0,
"reward": 0.6700541377067566,
"reward_std": 0.7563885450363159,
"rewards/cosine_scaled_reward/mean": -0.1649729162454605,
"rewards/cosine_scaled_reward/std": 0.3852999210357666,
"rewards/format_reward/mean": 1.0,
"rewards/format_reward/std": 0.0,
"step": 199
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.15625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1129.3125,
"completions/mean_terminated_length": 959.1851806640625,
"completions/min_length": 177.0,
"completions/min_terminated_length": 177.0,
"epoch": 0.22857142857142856,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.14575229585170746,
"learning_rate": 1.0006853717962393e-07,
"loss": 0.0553,
"num_tokens": 20577330.0,
"reward": 1.0288422107696533,
"reward_std": 0.7649609446525574,
"rewards/cosine_scaled_reward/mean": 0.04567110538482666,
"rewards/cosine_scaled_reward/std": 0.48820799589157104,
"rewards/format_reward/mean": 0.9375,
"rewards/format_reward/std": 0.24397502839565277,
"step": 200
},
{
"epoch": 0.22857142857142856,
"step": 200,
"total_flos": 0.0,
"train_loss": 0.03278075817739591,
"train_runtime": 11302.2127,
"train_samples_per_second": 1.133,
"train_steps_per_second": 0.018
}
],
"logging_steps": 1,
"max_steps": 200,
"num_input_tokens_seen": 20577330,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": true
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}