| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.05714285714285714, |
| "eval_steps": 500, |
| "global_step": 50, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2837146520614624, |
| "learning_rate": 0.0, |
| "loss": -0.0, |
| "num_tokens": 118418.0, |
| "reward": -0.09800112247467041, |
| "reward_std": 0.3028089702129364, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24220912158489227, |
| "learning_rate": 2e-08, |
| "loss": -0.0, |
| "num_tokens": 239748.0, |
| "reward": 0.020556632429361343, |
| "reward_std": 0.3545936942100525, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 869.0, |
| "completions/mean_length": 1946.515625, |
| "completions/mean_terminated_length": 749.0, |
| "completions/min_length": 609.0, |
| "completions/min_terminated_length": 609.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24765528738498688, |
| "learning_rate": 4e-08, |
| "loss": -0.0, |
| "num_tokens": 374797.0, |
| "reward": -0.20057085156440735, |
| "reward_std": 0.13691216707229614, |
| "rewards/cosine_scaled_reward/mean": -0.20057085156440735, |
| "rewards/cosine_scaled_reward/std": 0.16282624006271362, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1983.0, |
| "completions/mean_length": 1592.0, |
| "completions/mean_terminated_length": 967.1111450195312, |
| "completions/min_length": 516.0, |
| "completions/min_terminated_length": 516.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28862521052360535, |
| "learning_rate": 6e-08, |
| "loss": 0.0, |
| "num_tokens": 486493.0, |
| "reward": -0.19111667573451996, |
| "reward_std": 0.19739457964897156, |
| "rewards/cosine_scaled_reward/mean": -0.19111669063568115, |
| "rewards/cosine_scaled_reward/std": 0.22545036673545837, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1939.0, |
| "completions/mean_length": 1976.578125, |
| "completions/mean_terminated_length": 1395.0001220703125, |
| "completions/min_length": 610.0, |
| "completions/min_terminated_length": 610.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23521216213703156, |
| "learning_rate": 8e-08, |
| "loss": 0.0, |
| "num_tokens": 623810.0, |
| "reward": -0.2342512309551239, |
| "reward_std": 0.16005605459213257, |
| "rewards/cosine_scaled_reward/mean": -0.2342512309551239, |
| "rewards/cosine_scaled_reward/std": 0.20709452033042908, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1840.125, |
| "completions/mean_terminated_length": 939.3333740234375, |
| "completions/min_length": 552.0, |
| "completions/min_terminated_length": 552.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2831529676914215, |
| "learning_rate": 1e-07, |
| "loss": 0.0, |
| "num_tokens": 753226.0, |
| "reward": -0.1443408578634262, |
| "reward_std": 0.25838011503219604, |
| "rewards/cosine_scaled_reward/mean": -0.1443408727645874, |
| "rewards/cosine_scaled_reward/std": 0.3164331316947937, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2002.0, |
| "completions/mean_length": 1974.265625, |
| "completions/mean_terminated_length": 1458.125, |
| "completions/min_length": 1153.0, |
| "completions/min_terminated_length": 1153.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22311581671237946, |
| "learning_rate": 1.2e-07, |
| "loss": 0.0, |
| "num_tokens": 889987.0, |
| "reward": -0.15585696697235107, |
| "reward_std": 0.21075330674648285, |
| "rewards/cosine_scaled_reward/mean": -0.15585698187351227, |
| "rewards/cosine_scaled_reward/std": 0.3327982723712921, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1411.0, |
| "completions/mean_length": 1701.46875, |
| "completions/mean_terminated_length": 815.888916015625, |
| "completions/min_length": 346.0, |
| "completions/min_terminated_length": 346.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23218390345573425, |
| "learning_rate": 1.4e-07, |
| "loss": -0.0, |
| "num_tokens": 1009297.0, |
| "reward": -0.019736051559448242, |
| "reward_std": 0.22464922070503235, |
| "rewards/cosine_scaled_reward/mean": -0.01973605342209339, |
| "rewards/cosine_scaled_reward/std": 0.46309077739715576, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1836.0, |
| "completions/mean_length": 1936.96875, |
| "completions/mean_terminated_length": 1258.4444580078125, |
| "completions/min_length": 839.0, |
| "completions/min_terminated_length": 839.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2455250322818756, |
| "learning_rate": 1.6e-07, |
| "loss": -0.0, |
| "num_tokens": 1144719.0, |
| "reward": -0.22108668088912964, |
| "reward_std": 0.20550987124443054, |
| "rewards/cosine_scaled_reward/mean": -0.22108666598796844, |
| "rewards/cosine_scaled_reward/std": 0.27375248074531555, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1579.0, |
| "completions/mean_length": 1662.0625, |
| "completions/mean_terminated_length": 813.0, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26574036478996277, |
| "learning_rate": 1.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1261923.0, |
| "reward": -0.140568345785141, |
| "reward_std": 0.2796468734741211, |
| "rewards/cosine_scaled_reward/mean": -0.140568345785141, |
| "rewards/cosine_scaled_reward/std": 0.35179150104522705, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1335.0, |
| "completions/mean_length": 1970.859375, |
| "completions/mean_terminated_length": 1060.5999755859375, |
| "completions/min_length": 906.0, |
| "completions/min_terminated_length": 906.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24890610575675964, |
| "learning_rate": 2e-07, |
| "loss": -0.0, |
| "num_tokens": 1399730.0, |
| "reward": -0.2551690638065338, |
| "reward_std": 0.16209062933921814, |
| "rewards/cosine_scaled_reward/mean": -0.2551690638065338, |
| "rewards/cosine_scaled_reward/std": 0.2319207787513733, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2031.0, |
| "completions/mean_length": 1798.71875, |
| "completions/mean_terminated_length": 1322.8182373046875, |
| "completions/min_length": 724.0, |
| "completions/min_terminated_length": 724.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2804766595363617, |
| "learning_rate": 2.1999999999999998e-07, |
| "loss": 0.0, |
| "num_tokens": 1525792.0, |
| "reward": -0.19796784222126007, |
| "reward_std": 0.30078738927841187, |
| "rewards/cosine_scaled_reward/mean": -0.19796785712242126, |
| "rewards/cosine_scaled_reward/std": 0.3346545696258545, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1800.0, |
| "completions/mean_length": 1816.890625, |
| "completions/mean_terminated_length": 1123.5625, |
| "completions/min_length": 583.0, |
| "completions/min_terminated_length": 583.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2471778392791748, |
| "learning_rate": 2.4e-07, |
| "loss": -0.0, |
| "num_tokens": 1653113.0, |
| "reward": -0.17365078628063202, |
| "reward_std": 0.23729698359966278, |
| "rewards/cosine_scaled_reward/mean": -0.17365078628063202, |
| "rewards/cosine_scaled_reward/std": 0.2726025879383087, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1529.0, |
| "completions/mean_length": 1815.046875, |
| "completions/mean_terminated_length": 1171.0, |
| "completions/min_length": 639.0, |
| "completions/min_terminated_length": 639.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22734108567237854, |
| "learning_rate": 2.6e-07, |
| "loss": 0.0, |
| "num_tokens": 1779884.0, |
| "reward": -0.086978480219841, |
| "reward_std": 0.2551291584968567, |
| "rewards/cosine_scaled_reward/mean": -0.0869784876704216, |
| "rewards/cosine_scaled_reward/std": 0.4508184790611267, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1354.0, |
| "completions/mean_length": 1705.421875, |
| "completions/mean_terminated_length": 758.2941284179688, |
| "completions/min_length": 429.0, |
| "completions/min_terminated_length": 429.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25105422735214233, |
| "learning_rate": 2.8e-07, |
| "loss": -0.0, |
| "num_tokens": 1899951.0, |
| "reward": 0.025415867567062378, |
| "reward_std": 0.13560885190963745, |
| "rewards/cosine_scaled_reward/mean": 0.025415875017642975, |
| "rewards/cosine_scaled_reward/std": 0.4663754105567932, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2048.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2048.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23334357142448425, |
| "learning_rate": 3e-07, |
| "loss": -0.0, |
| "num_tokens": 2041463.0, |
| "reward": -0.2220873385667801, |
| "reward_std": 0.17581966519355774, |
| "rewards/cosine_scaled_reward/mean": -0.2220873236656189, |
| "rewards/cosine_scaled_reward/std": 0.1694367378950119, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.546875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1898.0, |
| "completions/mean_length": 1524.9375, |
| "completions/mean_terminated_length": 893.6551513671875, |
| "completions/min_length": 343.0, |
| "completions/min_terminated_length": 343.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.33780622482299805, |
| "learning_rate": 3.2e-07, |
| "loss": -0.0, |
| "num_tokens": 2149579.0, |
| "reward": -0.026115939021110535, |
| "reward_std": 0.3175298571586609, |
| "rewards/cosine_scaled_reward/mean": -0.026115931570529938, |
| "rewards/cosine_scaled_reward/std": 0.4766712486743927, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1635.0, |
| "completions/mean_length": 1771.34375, |
| "completions/mean_terminated_length": 1116.105224609375, |
| "completions/min_length": 538.0, |
| "completions/min_terminated_length": 538.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23123449087142944, |
| "learning_rate": 3.4000000000000003e-07, |
| "loss": -0.0, |
| "num_tokens": 2273321.0, |
| "reward": -0.15853706002235413, |
| "reward_std": 0.27896177768707275, |
| "rewards/cosine_scaled_reward/mean": -0.15853706002235413, |
| "rewards/cosine_scaled_reward/std": 0.3426607847213745, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2013.0, |
| "completions/mean_length": 1811.953125, |
| "completions/mean_terminated_length": 1159.3529052734375, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25707289576530457, |
| "learning_rate": 3.6e-07, |
| "loss": -0.0, |
| "num_tokens": 2400542.0, |
| "reward": -0.052606794983148575, |
| "reward_std": 0.31571486592292786, |
| "rewards/cosine_scaled_reward/mean": -0.052606794983148575, |
| "rewards/cosine_scaled_reward/std": 0.44901713728904724, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1807.0, |
| "completions/mean_length": 1632.953125, |
| "completions/mean_terminated_length": 840.5909423828125, |
| "completions/min_length": 379.0, |
| "completions/min_terminated_length": 379.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25764355063438416, |
| "learning_rate": 3.7999999999999996e-07, |
| "loss": 0.0, |
| "num_tokens": 2516403.0, |
| "reward": -0.07391424477100372, |
| "reward_std": 0.2678168714046478, |
| "rewards/cosine_scaled_reward/mean": -0.07391423732042313, |
| "rewards/cosine_scaled_reward/std": 0.3888758718967438, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1854.0, |
| "completions/mean_length": 1820.125, |
| "completions/mean_terminated_length": 1136.5, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27439141273498535, |
| "learning_rate": 4e-07, |
| "loss": 0.0, |
| "num_tokens": 2643699.0, |
| "reward": -0.16270118951797485, |
| "reward_std": 0.22588439285755157, |
| "rewards/cosine_scaled_reward/mean": -0.16270118951797485, |
| "rewards/cosine_scaled_reward/std": 0.39143073558807373, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1741.0, |
| "completions/mean_length": 1271.359375, |
| "completions/mean_terminated_length": 739.9736938476562, |
| "completions/min_length": 282.0, |
| "completions/min_terminated_length": 282.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.37971845269203186, |
| "learning_rate": 4.1999999999999995e-07, |
| "loss": -0.0, |
| "num_tokens": 2734082.0, |
| "reward": -0.00552794337272644, |
| "reward_std": 0.23386958241462708, |
| "rewards/cosine_scaled_reward/mean": -0.005527939647436142, |
| "rewards/cosine_scaled_reward/std": 0.4625597596168518, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1935.0, |
| "completions/mean_length": 1670.296875, |
| "completions/mean_terminated_length": 1081.0799560546875, |
| "completions/min_length": 472.0, |
| "completions/min_terminated_length": 472.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.28573453426361084, |
| "learning_rate": 4.3999999999999997e-07, |
| "loss": 0.0, |
| "num_tokens": 2851773.0, |
| "reward": -0.18269123136997223, |
| "reward_std": 0.2168647199869156, |
| "rewards/cosine_scaled_reward/mean": -0.18269124627113342, |
| "rewards/cosine_scaled_reward/std": 0.2703794836997986, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1802.0, |
| "completions/mean_length": 1757.296875, |
| "completions/mean_terminated_length": 1068.7894287109375, |
| "completions/min_length": 327.0, |
| "completions/min_terminated_length": 327.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2553797662258148, |
| "learning_rate": 4.6e-07, |
| "loss": 0.0, |
| "num_tokens": 2975168.0, |
| "reward": -0.23130035400390625, |
| "reward_std": 0.35076260566711426, |
| "rewards/cosine_scaled_reward/mean": -0.23130035400390625, |
| "rewards/cosine_scaled_reward/std": 0.3866168260574341, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1584.0, |
| "completions/mean_length": 1744.28125, |
| "completions/mean_terminated_length": 833.125, |
| "completions/min_length": 504.0, |
| "completions/min_terminated_length": 504.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2636294960975647, |
| "learning_rate": 4.8e-07, |
| "loss": -0.0, |
| "num_tokens": 3097098.0, |
| "reward": -0.19239474833011627, |
| "reward_std": 0.2867633104324341, |
| "rewards/cosine_scaled_reward/mean": -0.19239474833011627, |
| "rewards/cosine_scaled_reward/std": 0.347222238779068, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2033.0, |
| "completions/mean_length": 1932.09375, |
| "completions/mean_terminated_length": 1477.3846435546875, |
| "completions/min_length": 895.0, |
| "completions/min_terminated_length": 895.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22351376712322235, |
| "learning_rate": 5e-07, |
| "loss": -0.0, |
| "num_tokens": 3231384.0, |
| "reward": -0.006307817995548248, |
| "reward_std": 0.2015555500984192, |
| "rewards/cosine_scaled_reward/mean": -0.006307825446128845, |
| "rewards/cosine_scaled_reward/std": 0.4079793393611908, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1899.25, |
| "completions/mean_terminated_length": 1254.666748046875, |
| "completions/min_length": 545.0, |
| "completions/min_terminated_length": 545.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2670150697231293, |
| "learning_rate": 5.2e-07, |
| "loss": -0.0, |
| "num_tokens": 3363224.0, |
| "reward": -0.22071197628974915, |
| "reward_std": 0.2118011713027954, |
| "rewards/cosine_scaled_reward/mean": -0.22071197628974915, |
| "rewards/cosine_scaled_reward/std": 0.2716290354728699, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1947.0, |
| "completions/mean_length": 1767.609375, |
| "completions/mean_terminated_length": 926.4375, |
| "completions/min_length": 438.0, |
| "completions/min_terminated_length": 438.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25918784737586975, |
| "learning_rate": 5.4e-07, |
| "loss": -0.0, |
| "num_tokens": 3486687.0, |
| "reward": -0.10919298231601715, |
| "reward_std": 0.2716072201728821, |
| "rewards/cosine_scaled_reward/mean": -0.10919298231601715, |
| "rewards/cosine_scaled_reward/std": 0.44544270634651184, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.890625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1951.0, |
| "completions/mean_length": 1932.203125, |
| "completions/mean_terminated_length": 989.2857666015625, |
| "completions/min_length": 603.0, |
| "completions/min_terminated_length": 603.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24401192367076874, |
| "learning_rate": 5.6e-07, |
| "loss": 0.0, |
| "num_tokens": 3620820.0, |
| "reward": -0.19096782803535461, |
| "reward_std": 0.15806984901428223, |
| "rewards/cosine_scaled_reward/mean": -0.19096782803535461, |
| "rewards/cosine_scaled_reward/std": 0.181764155626297, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1928.0, |
| "completions/mean_length": 1880.71875, |
| "completions/mean_terminated_length": 1334.2667236328125, |
| "completions/min_length": 604.0, |
| "completions/min_terminated_length": 604.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22094956040382385, |
| "learning_rate": 5.8e-07, |
| "loss": -0.0, |
| "num_tokens": 3751722.0, |
| "reward": -0.21267297863960266, |
| "reward_std": 0.24843861162662506, |
| "rewards/cosine_scaled_reward/mean": -0.21267297863960266, |
| "rewards/cosine_scaled_reward/std": 0.29802343249320984, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1348.0, |
| "completions/mean_length": 1786.234375, |
| "completions/mean_terminated_length": 851.357177734375, |
| "completions/min_length": 355.0, |
| "completions/min_terminated_length": 355.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2912121117115021, |
| "learning_rate": 6e-07, |
| "loss": -0.0, |
| "num_tokens": 3876537.0, |
| "reward": -0.2621557414531708, |
| "reward_std": 0.18612943589687347, |
| "rewards/cosine_scaled_reward/mean": -0.2621557414531708, |
| "rewards/cosine_scaled_reward/std": 0.22891530394554138, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1841.0, |
| "completions/mean_length": 1948.765625, |
| "completions/mean_terminated_length": 1342.3333740234375, |
| "completions/min_length": 536.0, |
| "completions/min_terminated_length": 536.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2303810715675354, |
| "learning_rate": 6.2e-07, |
| "loss": 0.0, |
| "num_tokens": 4011610.0, |
| "reward": -0.1655973494052887, |
| "reward_std": 0.2392224669456482, |
| "rewards/cosine_scaled_reward/mean": -0.1655973345041275, |
| "rewards/cosine_scaled_reward/std": 0.3260692358016968, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1972.0, |
| "completions/mean_length": 1984.0, |
| "completions/mean_terminated_length": 1365.3333740234375, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23169051110744476, |
| "learning_rate": 6.4e-07, |
| "loss": 0.0, |
| "num_tokens": 4149802.0, |
| "reward": -0.22799505293369293, |
| "reward_std": 0.24000275135040283, |
| "rewards/cosine_scaled_reward/mean": -0.22799506783485413, |
| "rewards/cosine_scaled_reward/std": 0.30748653411865234, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.609375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1977.0, |
| "completions/mean_length": 1700.859375, |
| "completions/mean_terminated_length": 1159.3199462890625, |
| "completions/min_length": 433.0, |
| "completions/min_terminated_length": 433.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2647433578968048, |
| "learning_rate": 6.6e-07, |
| "loss": 0.0, |
| "num_tokens": 4268209.0, |
| "reward": -0.07232969254255295, |
| "reward_std": 0.3570185899734497, |
| "rewards/cosine_scaled_reward/mean": -0.07232969999313354, |
| "rewards/cosine_scaled_reward/std": 0.4520716369152069, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1458.0, |
| "completions/mean_length": 1884.625, |
| "completions/mean_terminated_length": 741.0, |
| "completions/min_length": 358.0, |
| "completions/min_terminated_length": 358.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2681647539138794, |
| "learning_rate": 6.800000000000001e-07, |
| "loss": 0.0, |
| "num_tokens": 4400321.0, |
| "reward": -0.21119418740272522, |
| "reward_std": 0.2156996876001358, |
| "rewards/cosine_scaled_reward/mean": -0.21119415760040283, |
| "rewards/cosine_scaled_reward/std": 0.304564893245697, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.96875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2002.0, |
| "completions/mean_length": 2032.765625, |
| "completions/mean_terminated_length": 1560.5, |
| "completions/min_length": 1119.0, |
| "completions/min_terminated_length": 1119.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25201615691185, |
| "learning_rate": 7e-07, |
| "loss": -0.0, |
| "num_tokens": 4541530.0, |
| "reward": -0.2148258090019226, |
| "reward_std": 0.1970210075378418, |
| "rewards/cosine_scaled_reward/mean": -0.2148257941007614, |
| "rewards/cosine_scaled_reward/std": 0.21921320259571075, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1954.5, |
| "completions/mean_terminated_length": 1383.111083984375, |
| "completions/min_length": 901.0, |
| "completions/min_terminated_length": 901.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.29214274883270264, |
| "learning_rate": 7.2e-07, |
| "loss": 0.0, |
| "num_tokens": 4677642.0, |
| "reward": -0.23519155383110046, |
| "reward_std": 0.14085054397583008, |
| "rewards/cosine_scaled_reward/mean": -0.23519155383110046, |
| "rewards/cosine_scaled_reward/std": 0.17065586149692535, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2016.0, |
| "completions/mean_length": 1949.1875, |
| "completions/mean_terminated_length": 1257.5, |
| "completions/min_length": 1042.0, |
| "completions/min_terminated_length": 1042.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2337840050458908, |
| "learning_rate": 7.4e-07, |
| "loss": -0.0, |
| "num_tokens": 4814102.0, |
| "reward": -0.16185586154460907, |
| "reward_std": 0.19152981042861938, |
| "rewards/cosine_scaled_reward/mean": -0.16185584664344788, |
| "rewards/cosine_scaled_reward/std": 0.3005273640155792, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1953.0, |
| "completions/mean_length": 1810.515625, |
| "completions/mean_terminated_length": 666.2727661132812, |
| "completions/min_length": 246.0, |
| "completions/min_terminated_length": 246.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.246645987033844, |
| "learning_rate": 7.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 4940759.0, |
| "reward": -0.10980962216854095, |
| "reward_std": 0.18094567954540253, |
| "rewards/cosine_scaled_reward/mean": -0.10980962216854095, |
| "rewards/cosine_scaled_reward/std": 0.3624936640262604, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.65625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1754.0, |
| "completions/mean_length": 1700.796875, |
| "completions/mean_terminated_length": 1037.95458984375, |
| "completions/min_length": 524.0, |
| "completions/min_terminated_length": 524.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.26321786642074585, |
| "learning_rate": 7.799999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5059682.0, |
| "reward": -0.14547404646873474, |
| "reward_std": 0.22270715236663818, |
| "rewards/cosine_scaled_reward/mean": -0.14547404646873474, |
| "rewards/cosine_scaled_reward/std": 0.4000875651836395, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1939.0, |
| "completions/mean_length": 1860.328125, |
| "completions/mean_terminated_length": 1415.8421630859375, |
| "completions/min_length": 982.0, |
| "completions/min_terminated_length": 982.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21273446083068848, |
| "learning_rate": 8e-07, |
| "loss": -0.0, |
| "num_tokens": 5189895.0, |
| "reward": -0.24220962822437286, |
| "reward_std": 0.27360057830810547, |
| "rewards/cosine_scaled_reward/mean": -0.24220961332321167, |
| "rewards/cosine_scaled_reward/std": 0.33429500460624695, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1547.0, |
| "completions/mean_length": 1694.375, |
| "completions/mean_terminated_length": 539.2000122070312, |
| "completions/min_length": 131.0, |
| "completions/min_terminated_length": 131.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3549652099609375, |
| "learning_rate": 8.199999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5308695.0, |
| "reward": -0.22589105367660522, |
| "reward_std": 0.16009008884429932, |
| "rewards/cosine_scaled_reward/mean": -0.22589105367660522, |
| "rewards/cosine_scaled_reward/std": 0.17985297739505768, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.796875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2006.0, |
| "completions/mean_length": 1824.75, |
| "completions/mean_terminated_length": 948.923095703125, |
| "completions/min_length": 473.0, |
| "completions/min_terminated_length": 473.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25625720620155334, |
| "learning_rate": 8.399999999999999e-07, |
| "loss": -0.0, |
| "num_tokens": 5437095.0, |
| "reward": -0.10874830186367035, |
| "reward_std": 0.2326180636882782, |
| "rewards/cosine_scaled_reward/mean": -0.10874830186367035, |
| "rewards/cosine_scaled_reward/std": 0.3275902569293976, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1751.0, |
| "completions/mean_length": 1673.734375, |
| "completions/mean_terminated_length": 787.3157958984375, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3032245934009552, |
| "learning_rate": 8.599999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5554910.0, |
| "reward": -0.1157154068350792, |
| "reward_std": 0.2323075234889984, |
| "rewards/cosine_scaled_reward/mean": -0.1157153993844986, |
| "rewards/cosine_scaled_reward/std": 0.4071435034275055, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1931.0, |
| "completions/mean_length": 2031.03125, |
| "completions/mean_terminated_length": 1776.5, |
| "completions/min_length": 1421.0, |
| "completions/min_terminated_length": 1421.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2320922464132309, |
| "learning_rate": 8.799999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 5696552.0, |
| "reward": -0.22731460630893707, |
| "reward_std": 0.19835877418518066, |
| "rewards/cosine_scaled_reward/mean": -0.22731460630893707, |
| "rewards/cosine_scaled_reward/std": 0.28479474782943726, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 981.0, |
| "completions/mean_length": 1890.3125, |
| "completions/mean_terminated_length": 786.5, |
| "completions/min_length": 490.0, |
| "completions/min_terminated_length": 490.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2494276612997055, |
| "learning_rate": 9e-07, |
| "loss": 0.0, |
| "num_tokens": 5828700.0, |
| "reward": -0.23243775963783264, |
| "reward_std": 0.18319474160671234, |
| "rewards/cosine_scaled_reward/mean": -0.23243777453899384, |
| "rewards/cosine_scaled_reward/std": 0.20973731577396393, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1398.0, |
| "completions/mean_length": 1672.09375, |
| "completions/mean_terminated_length": 711.4444580078125, |
| "completions/min_length": 303.0, |
| "completions/min_terminated_length": 303.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.3419908881187439, |
| "learning_rate": 9.2e-07, |
| "loss": 0.0, |
| "num_tokens": 5946114.0, |
| "reward": -0.16157878935337067, |
| "reward_std": 0.24494563043117523, |
| "rewards/cosine_scaled_reward/mean": -0.16157880425453186, |
| "rewards/cosine_scaled_reward/std": 0.39992472529411316, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1501.0, |
| "completions/mean_length": 1787.171875, |
| "completions/mean_terminated_length": 935.1333618164062, |
| "completions/min_length": 687.0, |
| "completions/min_terminated_length": 687.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.25991642475128174, |
| "learning_rate": 9.399999999999999e-07, |
| "loss": 0.0, |
| "num_tokens": 6071037.0, |
| "reward": -0.1829870045185089, |
| "reward_std": 0.2542135417461395, |
| "rewards/cosine_scaled_reward/mean": -0.1829870045185089, |
| "rewards/cosine_scaled_reward/std": 0.30597779154777527, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1981.0, |
| "completions/mean_length": 1565.34375, |
| "completions/mean_terminated_length": 944.7857666015625, |
| "completions/min_length": 322.0, |
| "completions/min_terminated_length": 322.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.27452352643013, |
| "learning_rate": 9.6e-07, |
| "loss": 0.0, |
| "num_tokens": 6181283.0, |
| "reward": -0.22301900386810303, |
| "reward_std": 0.25131016969680786, |
| "rewards/cosine_scaled_reward/mean": -0.22301900386810303, |
| "rewards/cosine_scaled_reward/std": 0.2918049991130829, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1947.0, |
| "completions/mean_length": 1775.515625, |
| "completions/mean_terminated_length": 885.4000244140625, |
| "completions/min_length": 280.0, |
| "completions/min_terminated_length": 280.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22758428752422333, |
| "learning_rate": 9.8e-07, |
| "loss": 0.0, |
| "num_tokens": 6305732.0, |
| "reward": -0.10754476487636566, |
| "reward_std": 0.18711507320404053, |
| "rewards/cosine_scaled_reward/mean": -0.10754477977752686, |
| "rewards/cosine_scaled_reward/std": 0.39105597138404846, |
| "step": 50 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 500, |
| "num_input_tokens_seen": 6305732, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|