TT-GRPO / checkpoint-50 /trainer_state.json
LLucass's picture
Training in progress, step 50, checkpoint
fd34839 verified
{
"best_global_step": null,
"best_metric": null,
"best_model_checkpoint": null,
"epoch": 0.05714285714285714,
"eval_steps": 500,
"global_step": 50,
"is_hyper_param_search": false,
"is_local_process_zero": true,
"is_world_process_zero": true,
"log_history": [
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.671875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1734.0,
"completions/mean_length": 1702.03125,
"completions/mean_terminated_length": 993.6190795898438,
"completions/min_length": 483.0,
"completions/min_terminated_length": 483.0,
"epoch": 0.001142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2837146520614624,
"learning_rate": 0.0,
"loss": -0.0,
"num_tokens": 118418.0,
"reward": -0.09800112247467041,
"reward_std": 0.3028089702129364,
"rewards/cosine_scaled_reward/mean": -0.09800112992525101,
"rewards/cosine_scaled_reward/std": 0.37953105568885803,
"step": 1
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1894.0,
"completions/mean_length": 1738.90625,
"completions/mean_terminated_length": 949.0,
"completions/min_length": 435.0,
"completions/min_terminated_length": 435.0,
"epoch": 0.002285714285714286,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24220912158489227,
"learning_rate": 2e-08,
"loss": -0.0,
"num_tokens": 239748.0,
"reward": 0.020556632429361343,
"reward_std": 0.3545936942100525,
"rewards/cosine_scaled_reward/mean": 0.020556632429361343,
"rewards/cosine_scaled_reward/std": 0.4492928683757782,
"step": 2
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 869.0,
"completions/mean_length": 1946.515625,
"completions/mean_terminated_length": 749.0,
"completions/min_length": 609.0,
"completions/min_terminated_length": 609.0,
"epoch": 0.0034285714285714284,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24765528738498688,
"learning_rate": 4e-08,
"loss": -0.0,
"num_tokens": 374797.0,
"reward": -0.20057085156440735,
"reward_std": 0.13691216707229614,
"rewards/cosine_scaled_reward/mean": -0.20057085156440735,
"rewards/cosine_scaled_reward/std": 0.16282624006271362,
"step": 3
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.578125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1983.0,
"completions/mean_length": 1592.0,
"completions/mean_terminated_length": 967.1111450195312,
"completions/min_length": 516.0,
"completions/min_terminated_length": 516.0,
"epoch": 0.004571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28862521052360535,
"learning_rate": 6e-08,
"loss": 0.0,
"num_tokens": 486493.0,
"reward": -0.19111667573451996,
"reward_std": 0.19739457964897156,
"rewards/cosine_scaled_reward/mean": -0.19111669063568115,
"rewards/cosine_scaled_reward/std": 0.22545036673545837,
"step": 4
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1939.0,
"completions/mean_length": 1976.578125,
"completions/mean_terminated_length": 1395.0001220703125,
"completions/min_length": 610.0,
"completions/min_terminated_length": 610.0,
"epoch": 0.005714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23521216213703156,
"learning_rate": 8e-08,
"loss": 0.0,
"num_tokens": 623810.0,
"reward": -0.2342512309551239,
"reward_std": 0.16005605459213257,
"rewards/cosine_scaled_reward/mean": -0.2342512309551239,
"rewards/cosine_scaled_reward/std": 0.20709452033042908,
"step": 5
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1987.0,
"completions/mean_length": 1840.125,
"completions/mean_terminated_length": 939.3333740234375,
"completions/min_length": 552.0,
"completions/min_terminated_length": 552.0,
"epoch": 0.006857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2831529676914215,
"learning_rate": 1e-07,
"loss": 0.0,
"num_tokens": 753226.0,
"reward": -0.1443408578634262,
"reward_std": 0.25838011503219604,
"rewards/cosine_scaled_reward/mean": -0.1443408727645874,
"rewards/cosine_scaled_reward/std": 0.3164331316947937,
"step": 6
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 1974.265625,
"completions/mean_terminated_length": 1458.125,
"completions/min_length": 1153.0,
"completions/min_terminated_length": 1153.0,
"epoch": 0.008,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22311581671237946,
"learning_rate": 1.2e-07,
"loss": 0.0,
"num_tokens": 889987.0,
"reward": -0.15585696697235107,
"reward_std": 0.21075330674648285,
"rewards/cosine_scaled_reward/mean": -0.15585698187351227,
"rewards/cosine_scaled_reward/std": 0.3327982723712921,
"step": 7
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1411.0,
"completions/mean_length": 1701.46875,
"completions/mean_terminated_length": 815.888916015625,
"completions/min_length": 346.0,
"completions/min_terminated_length": 346.0,
"epoch": 0.009142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23218390345573425,
"learning_rate": 1.4e-07,
"loss": -0.0,
"num_tokens": 1009297.0,
"reward": -0.019736051559448242,
"reward_std": 0.22464922070503235,
"rewards/cosine_scaled_reward/mean": -0.01973605342209339,
"rewards/cosine_scaled_reward/std": 0.46309077739715576,
"step": 8
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1836.0,
"completions/mean_length": 1936.96875,
"completions/mean_terminated_length": 1258.4444580078125,
"completions/min_length": 839.0,
"completions/min_terminated_length": 839.0,
"epoch": 0.010285714285714285,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2455250322818756,
"learning_rate": 1.6e-07,
"loss": -0.0,
"num_tokens": 1144719.0,
"reward": -0.22108668088912964,
"reward_std": 0.20550987124443054,
"rewards/cosine_scaled_reward/mean": -0.22108666598796844,
"rewards/cosine_scaled_reward/std": 0.27375248074531555,
"step": 9
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.6875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1579.0,
"completions/mean_length": 1662.0625,
"completions/mean_terminated_length": 813.0,
"completions/min_length": 389.0,
"completions/min_terminated_length": 389.0,
"epoch": 0.011428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26574036478996277,
"learning_rate": 1.8e-07,
"loss": -0.0,
"num_tokens": 1261923.0,
"reward": -0.140568345785141,
"reward_std": 0.2796468734741211,
"rewards/cosine_scaled_reward/mean": -0.140568345785141,
"rewards/cosine_scaled_reward/std": 0.35179150104522705,
"step": 10
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.921875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1335.0,
"completions/mean_length": 1970.859375,
"completions/mean_terminated_length": 1060.5999755859375,
"completions/min_length": 906.0,
"completions/min_terminated_length": 906.0,
"epoch": 0.012571428571428572,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24890610575675964,
"learning_rate": 2e-07,
"loss": -0.0,
"num_tokens": 1399730.0,
"reward": -0.2551690638065338,
"reward_std": 0.16209062933921814,
"rewards/cosine_scaled_reward/mean": -0.2551690638065338,
"rewards/cosine_scaled_reward/std": 0.2319207787513733,
"step": 11
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2031.0,
"completions/mean_length": 1798.71875,
"completions/mean_terminated_length": 1322.8182373046875,
"completions/min_length": 724.0,
"completions/min_terminated_length": 724.0,
"epoch": 0.013714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2804766595363617,
"learning_rate": 2.1999999999999998e-07,
"loss": 0.0,
"num_tokens": 1525792.0,
"reward": -0.19796784222126007,
"reward_std": 0.30078738927841187,
"rewards/cosine_scaled_reward/mean": -0.19796785712242126,
"rewards/cosine_scaled_reward/std": 0.3346545696258545,
"step": 12
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1800.0,
"completions/mean_length": 1816.890625,
"completions/mean_terminated_length": 1123.5625,
"completions/min_length": 583.0,
"completions/min_terminated_length": 583.0,
"epoch": 0.014857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2471778392791748,
"learning_rate": 2.4e-07,
"loss": -0.0,
"num_tokens": 1653113.0,
"reward": -0.17365078628063202,
"reward_std": 0.23729698359966278,
"rewards/cosine_scaled_reward/mean": -0.17365078628063202,
"rewards/cosine_scaled_reward/std": 0.2726025879383087,
"step": 13
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1529.0,
"completions/mean_length": 1815.046875,
"completions/mean_terminated_length": 1171.0,
"completions/min_length": 639.0,
"completions/min_terminated_length": 639.0,
"epoch": 0.016,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22734108567237854,
"learning_rate": 2.6e-07,
"loss": 0.0,
"num_tokens": 1779884.0,
"reward": -0.086978480219841,
"reward_std": 0.2551291584968567,
"rewards/cosine_scaled_reward/mean": -0.0869784876704216,
"rewards/cosine_scaled_reward/std": 0.4508184790611267,
"step": 14
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1354.0,
"completions/mean_length": 1705.421875,
"completions/mean_terminated_length": 758.2941284179688,
"completions/min_length": 429.0,
"completions/min_terminated_length": 429.0,
"epoch": 0.017142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25105422735214233,
"learning_rate": 2.8e-07,
"loss": -0.0,
"num_tokens": 1899951.0,
"reward": 0.025415867567062378,
"reward_std": 0.13560885190963745,
"rewards/cosine_scaled_reward/mean": 0.025415875017642975,
"rewards/cosine_scaled_reward/std": 0.4663754105567932,
"step": 15
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 1.0,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 0.0,
"completions/mean_length": 2048.0,
"completions/mean_terminated_length": 0.0,
"completions/min_length": 2048.0,
"completions/min_terminated_length": 0.0,
"epoch": 0.018285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23334357142448425,
"learning_rate": 3e-07,
"loss": -0.0,
"num_tokens": 2041463.0,
"reward": -0.2220873385667801,
"reward_std": 0.17581966519355774,
"rewards/cosine_scaled_reward/mean": -0.2220873236656189,
"rewards/cosine_scaled_reward/std": 0.1694367378950119,
"step": 16
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.546875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1898.0,
"completions/mean_length": 1524.9375,
"completions/mean_terminated_length": 893.6551513671875,
"completions/min_length": 343.0,
"completions/min_terminated_length": 343.0,
"epoch": 0.019428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.33780622482299805,
"learning_rate": 3.2e-07,
"loss": -0.0,
"num_tokens": 2149579.0,
"reward": -0.026115939021110535,
"reward_std": 0.3175298571586609,
"rewards/cosine_scaled_reward/mean": -0.026115931570529938,
"rewards/cosine_scaled_reward/std": 0.4766712486743927,
"step": 17
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1635.0,
"completions/mean_length": 1771.34375,
"completions/mean_terminated_length": 1116.105224609375,
"completions/min_length": 538.0,
"completions/min_terminated_length": 538.0,
"epoch": 0.02057142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23123449087142944,
"learning_rate": 3.4000000000000003e-07,
"loss": -0.0,
"num_tokens": 2273321.0,
"reward": -0.15853706002235413,
"reward_std": 0.27896177768707275,
"rewards/cosine_scaled_reward/mean": -0.15853706002235413,
"rewards/cosine_scaled_reward/std": 0.3426607847213745,
"step": 18
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.734375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2013.0,
"completions/mean_length": 1811.953125,
"completions/mean_terminated_length": 1159.3529052734375,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.021714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25707289576530457,
"learning_rate": 3.6e-07,
"loss": -0.0,
"num_tokens": 2400542.0,
"reward": -0.052606794983148575,
"reward_std": 0.31571486592292786,
"rewards/cosine_scaled_reward/mean": -0.052606794983148575,
"rewards/cosine_scaled_reward/std": 0.44901713728904724,
"step": 19
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1807.0,
"completions/mean_length": 1632.953125,
"completions/mean_terminated_length": 840.5909423828125,
"completions/min_length": 379.0,
"completions/min_terminated_length": 379.0,
"epoch": 0.022857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25764355063438416,
"learning_rate": 3.7999999999999996e-07,
"loss": 0.0,
"num_tokens": 2516403.0,
"reward": -0.07391424477100372,
"reward_std": 0.2678168714046478,
"rewards/cosine_scaled_reward/mean": -0.07391423732042313,
"rewards/cosine_scaled_reward/std": 0.3888758718967438,
"step": 20
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1854.0,
"completions/mean_length": 1820.125,
"completions/mean_terminated_length": 1136.5,
"completions/min_length": 344.0,
"completions/min_terminated_length": 344.0,
"epoch": 0.024,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27439141273498535,
"learning_rate": 4e-07,
"loss": 0.0,
"num_tokens": 2643699.0,
"reward": -0.16270118951797485,
"reward_std": 0.22588439285755157,
"rewards/cosine_scaled_reward/mean": -0.16270118951797485,
"rewards/cosine_scaled_reward/std": 0.39143073558807373,
"step": 21
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.40625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1741.0,
"completions/mean_length": 1271.359375,
"completions/mean_terminated_length": 739.9736938476562,
"completions/min_length": 282.0,
"completions/min_terminated_length": 282.0,
"epoch": 0.025142857142857144,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.37971845269203186,
"learning_rate": 4.1999999999999995e-07,
"loss": -0.0,
"num_tokens": 2734082.0,
"reward": -0.00552794337272644,
"reward_std": 0.23386958241462708,
"rewards/cosine_scaled_reward/mean": -0.005527939647436142,
"rewards/cosine_scaled_reward/std": 0.4625597596168518,
"step": 22
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1935.0,
"completions/mean_length": 1670.296875,
"completions/mean_terminated_length": 1081.0799560546875,
"completions/min_length": 472.0,
"completions/min_terminated_length": 472.0,
"epoch": 0.026285714285714287,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.28573453426361084,
"learning_rate": 4.3999999999999997e-07,
"loss": 0.0,
"num_tokens": 2851773.0,
"reward": -0.18269123136997223,
"reward_std": 0.2168647199869156,
"rewards/cosine_scaled_reward/mean": -0.18269124627113342,
"rewards/cosine_scaled_reward/std": 0.2703794836997986,
"step": 23
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1802.0,
"completions/mean_length": 1757.296875,
"completions/mean_terminated_length": 1068.7894287109375,
"completions/min_length": 327.0,
"completions/min_terminated_length": 327.0,
"epoch": 0.027428571428571427,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2553797662258148,
"learning_rate": 4.6e-07,
"loss": 0.0,
"num_tokens": 2975168.0,
"reward": -0.23130035400390625,
"reward_std": 0.35076260566711426,
"rewards/cosine_scaled_reward/mean": -0.23130035400390625,
"rewards/cosine_scaled_reward/std": 0.3866168260574341,
"step": 24
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1584.0,
"completions/mean_length": 1744.28125,
"completions/mean_terminated_length": 833.125,
"completions/min_length": 504.0,
"completions/min_terminated_length": 504.0,
"epoch": 0.02857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2636294960975647,
"learning_rate": 4.8e-07,
"loss": -0.0,
"num_tokens": 3097098.0,
"reward": -0.19239474833011627,
"reward_std": 0.2867633104324341,
"rewards/cosine_scaled_reward/mean": -0.19239474833011627,
"rewards/cosine_scaled_reward/std": 0.347222238779068,
"step": 25
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2033.0,
"completions/mean_length": 1932.09375,
"completions/mean_terminated_length": 1477.3846435546875,
"completions/min_length": 895.0,
"completions/min_terminated_length": 895.0,
"epoch": 0.029714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22351376712322235,
"learning_rate": 5e-07,
"loss": -0.0,
"num_tokens": 3231384.0,
"reward": -0.006307817995548248,
"reward_std": 0.2015555500984192,
"rewards/cosine_scaled_reward/mean": -0.006307825446128845,
"rewards/cosine_scaled_reward/std": 0.4079793393611908,
"step": 26
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.8125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1974.0,
"completions/mean_length": 1899.25,
"completions/mean_terminated_length": 1254.666748046875,
"completions/min_length": 545.0,
"completions/min_terminated_length": 545.0,
"epoch": 0.030857142857142857,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2670150697231293,
"learning_rate": 5.2e-07,
"loss": -0.0,
"num_tokens": 3363224.0,
"reward": -0.22071197628974915,
"reward_std": 0.2118011713027954,
"rewards/cosine_scaled_reward/mean": -0.22071197628974915,
"rewards/cosine_scaled_reward/std": 0.2716290354728699,
"step": 27
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.75,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 1767.609375,
"completions/mean_terminated_length": 926.4375,
"completions/min_length": 438.0,
"completions/min_terminated_length": 438.0,
"epoch": 0.032,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25918784737586975,
"learning_rate": 5.4e-07,
"loss": -0.0,
"num_tokens": 3486687.0,
"reward": -0.10919298231601715,
"reward_std": 0.2716072201728821,
"rewards/cosine_scaled_reward/mean": -0.10919298231601715,
"rewards/cosine_scaled_reward/std": 0.44544270634651184,
"step": 28
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.890625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1951.0,
"completions/mean_length": 1932.203125,
"completions/mean_terminated_length": 989.2857666015625,
"completions/min_length": 603.0,
"completions/min_terminated_length": 603.0,
"epoch": 0.03314285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.24401192367076874,
"learning_rate": 5.6e-07,
"loss": 0.0,
"num_tokens": 3620820.0,
"reward": -0.19096782803535461,
"reward_std": 0.15806984901428223,
"rewards/cosine_scaled_reward/mean": -0.19096782803535461,
"rewards/cosine_scaled_reward/std": 0.181764155626297,
"step": 29
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1928.0,
"completions/mean_length": 1880.71875,
"completions/mean_terminated_length": 1334.2667236328125,
"completions/min_length": 604.0,
"completions/min_terminated_length": 604.0,
"epoch": 0.03428571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22094956040382385,
"learning_rate": 5.8e-07,
"loss": -0.0,
"num_tokens": 3751722.0,
"reward": -0.21267297863960266,
"reward_std": 0.24843861162662506,
"rewards/cosine_scaled_reward/mean": -0.21267297863960266,
"rewards/cosine_scaled_reward/std": 0.29802343249320984,
"step": 30
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.78125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1348.0,
"completions/mean_length": 1786.234375,
"completions/mean_terminated_length": 851.357177734375,
"completions/min_length": 355.0,
"completions/min_terminated_length": 355.0,
"epoch": 0.03542857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2912121117115021,
"learning_rate": 6e-07,
"loss": -0.0,
"num_tokens": 3876537.0,
"reward": -0.2621557414531708,
"reward_std": 0.18612943589687347,
"rewards/cosine_scaled_reward/mean": -0.2621557414531708,
"rewards/cosine_scaled_reward/std": 0.22891530394554138,
"step": 31
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1841.0,
"completions/mean_length": 1948.765625,
"completions/mean_terminated_length": 1342.3333740234375,
"completions/min_length": 536.0,
"completions/min_terminated_length": 536.0,
"epoch": 0.036571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2303810715675354,
"learning_rate": 6.2e-07,
"loss": 0.0,
"num_tokens": 4011610.0,
"reward": -0.1655973494052887,
"reward_std": 0.2392224669456482,
"rewards/cosine_scaled_reward/mean": -0.1655973345041275,
"rewards/cosine_scaled_reward/std": 0.3260692358016968,
"step": 32
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.90625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1972.0,
"completions/mean_length": 1984.0,
"completions/mean_terminated_length": 1365.3333740234375,
"completions/min_length": 965.0,
"completions/min_terminated_length": 965.0,
"epoch": 0.037714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.23169051110744476,
"learning_rate": 6.4e-07,
"loss": 0.0,
"num_tokens": 4149802.0,
"reward": -0.22799505293369293,
"reward_std": 0.24000275135040283,
"rewards/cosine_scaled_reward/mean": -0.22799506783485413,
"rewards/cosine_scaled_reward/std": 0.30748653411865234,
"step": 33
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.609375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1977.0,
"completions/mean_length": 1700.859375,
"completions/mean_terminated_length": 1159.3199462890625,
"completions/min_length": 433.0,
"completions/min_terminated_length": 433.0,
"epoch": 0.038857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2647433578968048,
"learning_rate": 6.6e-07,
"loss": 0.0,
"num_tokens": 4268209.0,
"reward": -0.07232969254255295,
"reward_std": 0.3570185899734497,
"rewards/cosine_scaled_reward/mean": -0.07232969999313354,
"rewards/cosine_scaled_reward/std": 0.4520716369152069,
"step": 34
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1458.0,
"completions/mean_length": 1884.625,
"completions/mean_terminated_length": 741.0,
"completions/min_length": 358.0,
"completions/min_terminated_length": 358.0,
"epoch": 0.04,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2681647539138794,
"learning_rate": 6.800000000000001e-07,
"loss": 0.0,
"num_tokens": 4400321.0,
"reward": -0.21119418740272522,
"reward_std": 0.2156996876001358,
"rewards/cosine_scaled_reward/mean": -0.21119415760040283,
"rewards/cosine_scaled_reward/std": 0.304564893245697,
"step": 35
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.96875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2002.0,
"completions/mean_length": 2032.765625,
"completions/mean_terminated_length": 1560.5,
"completions/min_length": 1119.0,
"completions/min_terminated_length": 1119.0,
"epoch": 0.04114285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25201615691185,
"learning_rate": 7e-07,
"loss": -0.0,
"num_tokens": 4541530.0,
"reward": -0.2148258090019226,
"reward_std": 0.1970210075378418,
"rewards/cosine_scaled_reward/mean": -0.2148257941007614,
"rewards/cosine_scaled_reward/std": 0.21921320259571075,
"step": 36
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.859375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2005.0,
"completions/mean_length": 1954.5,
"completions/mean_terminated_length": 1383.111083984375,
"completions/min_length": 901.0,
"completions/min_terminated_length": 901.0,
"epoch": 0.04228571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.29214274883270264,
"learning_rate": 7.2e-07,
"loss": 0.0,
"num_tokens": 4677642.0,
"reward": -0.23519155383110046,
"reward_std": 0.14085054397583008,
"rewards/cosine_scaled_reward/mean": -0.23519155383110046,
"rewards/cosine_scaled_reward/std": 0.17065586149692535,
"step": 37
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2016.0,
"completions/mean_length": 1949.1875,
"completions/mean_terminated_length": 1257.5,
"completions/min_length": 1042.0,
"completions/min_terminated_length": 1042.0,
"epoch": 0.04342857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2337840050458908,
"learning_rate": 7.4e-07,
"loss": -0.0,
"num_tokens": 4814102.0,
"reward": -0.16185586154460907,
"reward_std": 0.19152981042861938,
"rewards/cosine_scaled_reward/mean": -0.16185584664344788,
"rewards/cosine_scaled_reward/std": 0.3005273640155792,
"step": 38
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.828125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1953.0,
"completions/mean_length": 1810.515625,
"completions/mean_terminated_length": 666.2727661132812,
"completions/min_length": 246.0,
"completions/min_terminated_length": 246.0,
"epoch": 0.044571428571428574,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.246645987033844,
"learning_rate": 7.599999999999999e-07,
"loss": 0.0,
"num_tokens": 4940759.0,
"reward": -0.10980962216854095,
"reward_std": 0.18094567954540253,
"rewards/cosine_scaled_reward/mean": -0.10980962216854095,
"rewards/cosine_scaled_reward/std": 0.3624936640262604,
"step": 39
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.65625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1754.0,
"completions/mean_length": 1700.796875,
"completions/mean_terminated_length": 1037.95458984375,
"completions/min_length": 524.0,
"completions/min_terminated_length": 524.0,
"epoch": 0.045714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.26321786642074585,
"learning_rate": 7.799999999999999e-07,
"loss": -0.0,
"num_tokens": 5059682.0,
"reward": -0.14547404646873474,
"reward_std": 0.22270715236663818,
"rewards/cosine_scaled_reward/mean": -0.14547404646873474,
"rewards/cosine_scaled_reward/std": 0.4000875651836395,
"step": 40
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1939.0,
"completions/mean_length": 1860.328125,
"completions/mean_terminated_length": 1415.8421630859375,
"completions/min_length": 982.0,
"completions/min_terminated_length": 982.0,
"epoch": 0.046857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.21273446083068848,
"learning_rate": 8e-07,
"loss": -0.0,
"num_tokens": 5189895.0,
"reward": -0.24220962822437286,
"reward_std": 0.27360057830810547,
"rewards/cosine_scaled_reward/mean": -0.24220961332321167,
"rewards/cosine_scaled_reward/std": 0.33429500460624695,
"step": 41
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1547.0,
"completions/mean_length": 1694.375,
"completions/mean_terminated_length": 539.2000122070312,
"completions/min_length": 131.0,
"completions/min_terminated_length": 131.0,
"epoch": 0.048,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3549652099609375,
"learning_rate": 8.199999999999999e-07,
"loss": -0.0,
"num_tokens": 5308695.0,
"reward": -0.22589105367660522,
"reward_std": 0.16009008884429932,
"rewards/cosine_scaled_reward/mean": -0.22589105367660522,
"rewards/cosine_scaled_reward/std": 0.17985297739505768,
"step": 42
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.796875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 2006.0,
"completions/mean_length": 1824.75,
"completions/mean_terminated_length": 948.923095703125,
"completions/min_length": 473.0,
"completions/min_terminated_length": 473.0,
"epoch": 0.04914285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25625720620155334,
"learning_rate": 8.399999999999999e-07,
"loss": -0.0,
"num_tokens": 5437095.0,
"reward": -0.10874830186367035,
"reward_std": 0.2326180636882782,
"rewards/cosine_scaled_reward/mean": -0.10874830186367035,
"rewards/cosine_scaled_reward/std": 0.3275902569293976,
"step": 43
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.703125,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1751.0,
"completions/mean_length": 1673.734375,
"completions/mean_terminated_length": 787.3157958984375,
"completions/min_length": 484.0,
"completions/min_terminated_length": 484.0,
"epoch": 0.05028571428571429,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3032245934009552,
"learning_rate": 8.599999999999999e-07,
"loss": 0.0,
"num_tokens": 5554910.0,
"reward": -0.1157154068350792,
"reward_std": 0.2323075234889984,
"rewards/cosine_scaled_reward/mean": -0.1157153993844986,
"rewards/cosine_scaled_reward/std": 0.4071435034275055,
"step": 44
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.9375,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1931.0,
"completions/mean_length": 2031.03125,
"completions/mean_terminated_length": 1776.5,
"completions/min_length": 1421.0,
"completions/min_terminated_length": 1421.0,
"epoch": 0.05142857142857143,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2320922464132309,
"learning_rate": 8.799999999999999e-07,
"loss": 0.0,
"num_tokens": 5696552.0,
"reward": -0.22731460630893707,
"reward_std": 0.19835877418518066,
"rewards/cosine_scaled_reward/mean": -0.22731460630893707,
"rewards/cosine_scaled_reward/std": 0.28479474782943726,
"step": 45
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 981.0,
"completions/mean_length": 1890.3125,
"completions/mean_terminated_length": 786.5,
"completions/min_length": 490.0,
"completions/min_terminated_length": 490.0,
"epoch": 0.052571428571428575,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.2494276612997055,
"learning_rate": 9e-07,
"loss": 0.0,
"num_tokens": 5828700.0,
"reward": -0.23243775963783264,
"reward_std": 0.18319474160671234,
"rewards/cosine_scaled_reward/mean": -0.23243777453899384,
"rewards/cosine_scaled_reward/std": 0.20973731577396393,
"step": 46
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.71875,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1398.0,
"completions/mean_length": 1672.09375,
"completions/mean_terminated_length": 711.4444580078125,
"completions/min_length": 303.0,
"completions/min_terminated_length": 303.0,
"epoch": 0.053714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.3419908881187439,
"learning_rate": 9.2e-07,
"loss": 0.0,
"num_tokens": 5946114.0,
"reward": -0.16157878935337067,
"reward_std": 0.24494563043117523,
"rewards/cosine_scaled_reward/mean": -0.16157880425453186,
"rewards/cosine_scaled_reward/std": 0.39992472529411316,
"step": 47
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1501.0,
"completions/mean_length": 1787.171875,
"completions/mean_terminated_length": 935.1333618164062,
"completions/min_length": 687.0,
"completions/min_terminated_length": 687.0,
"epoch": 0.054857142857142854,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.25991642475128174,
"learning_rate": 9.399999999999999e-07,
"loss": 0.0,
"num_tokens": 6071037.0,
"reward": -0.1829870045185089,
"reward_std": 0.2542135417461395,
"rewards/cosine_scaled_reward/mean": -0.1829870045185089,
"rewards/cosine_scaled_reward/std": 0.30597779154777527,
"step": 48
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.5625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1981.0,
"completions/mean_length": 1565.34375,
"completions/mean_terminated_length": 944.7857666015625,
"completions/min_length": 322.0,
"completions/min_terminated_length": 322.0,
"epoch": 0.056,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.27452352643013,
"learning_rate": 9.6e-07,
"loss": 0.0,
"num_tokens": 6181283.0,
"reward": -0.22301900386810303,
"reward_std": 0.25131016969680786,
"rewards/cosine_scaled_reward/mean": -0.22301900386810303,
"rewards/cosine_scaled_reward/std": 0.2918049991130829,
"step": 49
},
{
"clip_ratio/high_max": 0.0,
"clip_ratio/high_mean": 0.0,
"clip_ratio/low_mean": 0.0,
"clip_ratio/low_min": 0.0,
"clip_ratio/region_mean": 0.0,
"completions/clipped_ratio": 0.765625,
"completions/max_length": 2048.0,
"completions/max_terminated_length": 1947.0,
"completions/mean_length": 1775.515625,
"completions/mean_terminated_length": 885.4000244140625,
"completions/min_length": 280.0,
"completions/min_terminated_length": 280.0,
"epoch": 0.05714285714285714,
"frac_reward_zero_std": 0.0,
"grad_norm": 0.22758428752422333,
"learning_rate": 9.8e-07,
"loss": 0.0,
"num_tokens": 6305732.0,
"reward": -0.10754476487636566,
"reward_std": 0.18711507320404053,
"rewards/cosine_scaled_reward/mean": -0.10754477977752686,
"rewards/cosine_scaled_reward/std": 0.39105597138404846,
"step": 50
}
],
"logging_steps": 1,
"max_steps": 500,
"num_input_tokens_seen": 6305732,
"num_train_epochs": 1,
"save_steps": 50,
"stateful_callbacks": {
"TrainerControl": {
"args": {
"should_epoch_stop": false,
"should_evaluate": false,
"should_log": false,
"should_save": true,
"should_training_stop": false
},
"attributes": {}
}
},
"total_flos": 0.0,
"train_batch_size": 4,
"trial_name": null,
"trial_params": null
}