| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.22857142857142856, |
| "eval_steps": 500, |
| "global_step": 200, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1702.03125, |
| "completions/mean_terminated_length": 993.6190795898438, |
| "completions/min_length": 483.0, |
| "completions/min_terminated_length": 483.0, |
| "epoch": 0.001142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20054349303245544, |
| "learning_rate": 0.0, |
| "loss": 0.0427, |
| "num_tokens": 118418.0, |
| "reward": 0.17899775505065918, |
| "reward_std": 0.7650213241577148, |
| "rewards/cosine_scaled_reward/mean": -0.09800112992525101, |
| "rewards/cosine_scaled_reward/std": 0.37953105568885803, |
| "rewards/format_reward/mean": 0.375, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 1 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1738.90625, |
| "completions/mean_terminated_length": 949.0, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.002285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19502359628677368, |
| "learning_rate": 5e-08, |
| "loss": 0.0561, |
| "num_tokens": 239748.0, |
| "reward": 0.3848632574081421, |
| "reward_std": 0.9111153483390808, |
| "rewards/cosine_scaled_reward/mean": 0.020556632429361343, |
| "rewards/cosine_scaled_reward/std": 0.4492928683757782, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 2 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1287.0, |
| "completions/mean_length": 1944.453125, |
| "completions/mean_terminated_length": 943.5, |
| "completions/min_length": 608.0, |
| "completions/min_terminated_length": 608.0, |
| "epoch": 0.0034285714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.230765700340271, |
| "learning_rate": 1e-07, |
| "loss": 0.0549, |
| "num_tokens": 374665.0, |
| "reward": -0.28856638073921204, |
| "reward_std": 0.4003669023513794, |
| "rewards/cosine_scaled_reward/mean": -0.19897069036960602, |
| "rewards/cosine_scaled_reward/std": 0.18252794444561005, |
| "rewards/format_reward/mean": 0.109375, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 3 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1871.0, |
| "completions/mean_length": 1592.3125, |
| "completions/mean_terminated_length": 1006.4285888671875, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.004571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20995420217514038, |
| "learning_rate": 1.5e-07, |
| "loss": 0.1266, |
| "num_tokens": 486381.0, |
| "reward": 0.20640414953231812, |
| "reward_std": 0.8193759918212891, |
| "rewards/cosine_scaled_reward/mean": -0.13117292523384094, |
| "rewards/cosine_scaled_reward/std": 0.35454094409942627, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 4 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1680.0, |
| "completions/mean_length": 2002.859375, |
| "completions/mean_terminated_length": 1085.0, |
| "completions/min_length": 755.0, |
| "completions/min_terminated_length": 755.0, |
| "epoch": 0.005714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23816199600696564, |
| "learning_rate": 2e-07, |
| "loss": 0.01, |
| "num_tokens": 625380.0, |
| "reward": -0.41131818294525146, |
| "reward_std": 0.30660682916641235, |
| "rewards/cosine_scaled_reward/mean": -0.24472159147262573, |
| "rewards/cosine_scaled_reward/std": 0.19079075753688812, |
| "rewards/format_reward/mean": 0.078125, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 5 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1344.0, |
| "completions/mean_length": 1890.0, |
| "completions/mean_terminated_length": 784.0, |
| "completions/min_length": 440.0, |
| "completions/min_terminated_length": 440.0, |
| "epoch": 0.006857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.24285951256752014, |
| "learning_rate": 2.5e-07, |
| "loss": -0.0119, |
| "num_tokens": 757988.0, |
| "reward": -0.24828195571899414, |
| "reward_std": 0.3839319050312042, |
| "rewards/cosine_scaled_reward/mean": -0.19445347785949707, |
| "rewards/cosine_scaled_reward/std": 0.19692479074001312, |
| "rewards/format_reward/mean": 0.140625, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 6 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1941.0, |
| "completions/mean_length": 1935.046875, |
| "completions/mean_terminated_length": 1390.8182373046875, |
| "completions/min_length": 886.0, |
| "completions/min_terminated_length": 886.0, |
| "epoch": 0.008, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2183438539505005, |
| "learning_rate": 3e-07, |
| "loss": 0.0412, |
| "num_tokens": 892239.0, |
| "reward": -0.07044821977615356, |
| "reward_std": 0.5991545915603638, |
| "rewards/cosine_scaled_reward/mean": -0.14459910988807678, |
| "rewards/cosine_scaled_reward/std": 0.3703240156173706, |
| "rewards/format_reward/mean": 0.21875, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 7 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 1743.921875, |
| "completions/mean_terminated_length": 966.8333129882812, |
| "completions/min_length": 296.0, |
| "completions/min_terminated_length": 296.0, |
| "epoch": 0.009142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18490855395793915, |
| "learning_rate": 3.5e-07, |
| "loss": 0.0096, |
| "num_tokens": 1014266.0, |
| "reward": 0.07391861081123352, |
| "reward_std": 0.5062483549118042, |
| "rewards/cosine_scaled_reward/mean": -0.11929068714380264, |
| "rewards/cosine_scaled_reward/std": 0.4095526933670044, |
| "rewards/format_reward/mean": 0.3125, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 8 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1790.0, |
| "completions/mean_length": 1965.46875, |
| "completions/mean_terminated_length": 1461.111083984375, |
| "completions/min_length": 1029.0, |
| "completions/min_terminated_length": 1029.0, |
| "epoch": 0.010285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21707069873809814, |
| "learning_rate": 4e-07, |
| "loss": 0.0566, |
| "num_tokens": 1151512.0, |
| "reward": -0.15350507199764252, |
| "reward_std": 0.7245944738388062, |
| "rewards/cosine_scaled_reward/mean": -0.18612754344940186, |
| "rewards/cosine_scaled_reward/std": 0.30883485078811646, |
| "rewards/format_reward/mean": 0.21875, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 9 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1745.0, |
| "completions/mean_length": 1682.59375, |
| "completions/mean_terminated_length": 817.1578979492188, |
| "completions/min_length": 394.0, |
| "completions/min_terminated_length": 394.0, |
| "epoch": 0.011428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20094214379787445, |
| "learning_rate": 4.5e-07, |
| "loss": 0.0457, |
| "num_tokens": 1270030.0, |
| "reward": 0.027805477380752563, |
| "reward_std": 0.4805509150028229, |
| "rewards/cosine_scaled_reward/mean": -0.14234726130962372, |
| "rewards/cosine_scaled_reward/std": 0.26565250754356384, |
| "rewards/format_reward/mean": 0.3125, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 10 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.953125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1094.0, |
| "completions/mean_length": 1998.15625, |
| "completions/mean_terminated_length": 984.6666870117188, |
| "completions/min_length": 798.0, |
| "completions/min_terminated_length": 798.0, |
| "epoch": 0.012571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2170705795288086, |
| "learning_rate": 5e-07, |
| "loss": 0.0247, |
| "num_tokens": 1409584.0, |
| "reward": -0.43332377076148987, |
| "reward_std": 0.36288702487945557, |
| "rewards/cosine_scaled_reward/mean": -0.24791188538074493, |
| "rewards/cosine_scaled_reward/std": 0.17533892393112183, |
| "rewards/format_reward/mean": 0.0625, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 11 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2044.0, |
| "completions/mean_length": 1630.375, |
| "completions/mean_terminated_length": 1093.4285888671875, |
| "completions/min_length": 427.0, |
| "completions/min_terminated_length": 427.0, |
| "epoch": 0.013714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2160935252904892, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0753, |
| "num_tokens": 1524872.0, |
| "reward": 0.0067175328731536865, |
| "reward_std": 0.689138650894165, |
| "rewards/cosine_scaled_reward/mean": -0.22320374846458435, |
| "rewards/cosine_scaled_reward/std": 0.3645767867565155, |
| "rewards/format_reward/mean": 0.453125, |
| "rewards/format_reward/std": 0.501733124256134, |
| "step": 12 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1751.0, |
| "completions/mean_length": 1833.453125, |
| "completions/mean_terminated_length": 1067.21435546875, |
| "completions/min_length": 616.0, |
| "completions/min_terminated_length": 616.0, |
| "epoch": 0.014857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2122364640235901, |
| "learning_rate": 6e-07, |
| "loss": 0.0326, |
| "num_tokens": 1653253.0, |
| "reward": -0.09265299141407013, |
| "reward_std": 0.5985201001167297, |
| "rewards/cosine_scaled_reward/mean": -0.17913900315761566, |
| "rewards/cosine_scaled_reward/std": 0.306300550699234, |
| "rewards/format_reward/mean": 0.265625, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 13 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1897.0, |
| "completions/mean_length": 1823.40625, |
| "completions/mean_terminated_length": 1202.4705810546875, |
| "completions/min_length": 605.0, |
| "completions/min_terminated_length": 605.0, |
| "epoch": 0.016, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2076576203107834, |
| "learning_rate": 6.5e-07, |
| "loss": 0.0261, |
| "num_tokens": 1780559.0, |
| "reward": 0.005522748455405235, |
| "reward_std": 0.7086418867111206, |
| "rewards/cosine_scaled_reward/mean": -0.1378636360168457, |
| "rewards/cosine_scaled_reward/std": 0.35400503873825073, |
| "rewards/format_reward/mean": 0.28125, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 14 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1328.0, |
| "completions/mean_length": 1698.171875, |
| "completions/mean_terminated_length": 731.0, |
| "completions/min_length": 391.0, |
| "completions/min_terminated_length": 391.0, |
| "epoch": 0.017142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1969502866268158, |
| "learning_rate": 7e-07, |
| "loss": 0.0216, |
| "num_tokens": 1900162.0, |
| "reward": 0.2789269685745239, |
| "reward_std": 0.43547046184539795, |
| "rewards/cosine_scaled_reward/mean": -0.00897398591041565, |
| "rewards/cosine_scaled_reward/std": 0.4515364170074463, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 15 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 1.0, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 0.0, |
| "completions/mean_length": 2048.0, |
| "completions/mean_terminated_length": 0.0, |
| "completions/min_length": 2048.0, |
| "completions/min_terminated_length": 0.0, |
| "epoch": 0.018285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23300249874591827, |
| "learning_rate": 7.5e-07, |
| "loss": -0.0, |
| "num_tokens": 2041674.0, |
| "reward": -0.5078557729721069, |
| "reward_std": 0.3458974361419678, |
| "rewards/cosine_scaled_reward/mean": -0.25392788648605347, |
| "rewards/cosine_scaled_reward/std": 0.18378609418869019, |
| "rewards/format_reward/mean": 0.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 16 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1563.734375, |
| "completions/mean_terminated_length": 941.107177734375, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.019428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20892462134361267, |
| "learning_rate": 8e-07, |
| "loss": 0.0477, |
| "num_tokens": 2152273.0, |
| "reward": 0.3328002989292145, |
| "reward_std": 0.7669951319694519, |
| "rewards/cosine_scaled_reward/mean": -0.06797486543655396, |
| "rewards/cosine_scaled_reward/std": 0.4412795305252075, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 17 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1465.0, |
| "completions/mean_length": 1778.90625, |
| "completions/mean_terminated_length": 899.86669921875, |
| "completions/min_length": 535.0, |
| "completions/min_terminated_length": 535.0, |
| "epoch": 0.02057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19322611391544342, |
| "learning_rate": 8.499999999999999e-07, |
| "loss": 0.0726, |
| "num_tokens": 2276499.0, |
| "reward": -0.18389344215393066, |
| "reward_std": 0.5934990644454956, |
| "rewards/cosine_scaled_reward/mean": -0.23257172107696533, |
| "rewards/cosine_scaled_reward/std": 0.256833553314209, |
| "rewards/format_reward/mean": 0.28125, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 18 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1771.0, |
| "completions/mean_length": 1869.53125, |
| "completions/mean_terminated_length": 1232.1429443359375, |
| "completions/min_length": 711.0, |
| "completions/min_terminated_length": 711.0, |
| "epoch": 0.021714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21417103707790375, |
| "learning_rate": 9e-07, |
| "loss": 0.0378, |
| "num_tokens": 2407405.0, |
| "reward": -0.05162222683429718, |
| "reward_std": 0.7635236978530884, |
| "rewards/cosine_scaled_reward/mean": -0.158623605966568, |
| "rewards/cosine_scaled_reward/std": 0.4003170132637024, |
| "rewards/format_reward/mean": 0.265625, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 19 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1836.0, |
| "completions/mean_length": 1572.90625, |
| "completions/mean_terminated_length": 878.5385131835938, |
| "completions/min_length": 369.0, |
| "completions/min_terminated_length": 369.0, |
| "epoch": 0.022857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1591554582118988, |
| "learning_rate": 9.499999999999999e-07, |
| "loss": 0.0507, |
| "num_tokens": 2519423.0, |
| "reward": 0.2816518545150757, |
| "reward_std": 0.7381908893585205, |
| "rewards/cosine_scaled_reward/mean": -0.07011157274246216, |
| "rewards/cosine_scaled_reward/std": 0.35477158427238464, |
| "rewards/format_reward/mean": 0.421875, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 20 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1776.28125, |
| "completions/mean_terminated_length": 1081.888916015625, |
| "completions/min_length": 332.0, |
| "completions/min_terminated_length": 332.0, |
| "epoch": 0.024, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22487252950668335, |
| "learning_rate": 1e-06, |
| "loss": 0.0137, |
| "num_tokens": 2643913.0, |
| "reward": -0.0122755765914917, |
| "reward_std": 0.4569401443004608, |
| "rewards/cosine_scaled_reward/mean": -0.16238778829574585, |
| "rewards/cosine_scaled_reward/std": 0.3900769054889679, |
| "rewards/format_reward/mean": 0.3125, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 21 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1851.0, |
| "completions/mean_length": 1273.1875, |
| "completions/mean_terminated_length": 776.5128173828125, |
| "completions/min_length": 242.0, |
| "completions/min_terminated_length": 242.0, |
| "epoch": 0.025142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1901247799396515, |
| "learning_rate": 9.99931462820376e-07, |
| "loss": -0.0442, |
| "num_tokens": 2734413.0, |
| "reward": 0.5235691666603088, |
| "reward_std": 0.4210290312767029, |
| "rewards/cosine_scaled_reward/mean": -0.07415291666984558, |
| "rewards/cosine_scaled_reward/std": 0.40765848755836487, |
| "rewards/format_reward/mean": 0.671875, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 22 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1954.0, |
| "completions/mean_length": 1640.84375, |
| "completions/mean_terminated_length": 1082.888916015625, |
| "completions/min_length": 363.0, |
| "completions/min_terminated_length": 363.0, |
| "epoch": 0.026285714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21930935978889465, |
| "learning_rate": 9.997258721585931e-07, |
| "loss": 0.0518, |
| "num_tokens": 2850219.0, |
| "reward": 0.23656107485294342, |
| "reward_std": 0.6851356029510498, |
| "rewards/cosine_scaled_reward/mean": -0.10046947002410889, |
| "rewards/cosine_scaled_reward/std": 0.45323267579078674, |
| "rewards/format_reward/mean": 0.4375, |
| "rewards/format_reward/std": 0.5, |
| "step": 23 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1985.0, |
| "completions/mean_length": 1785.265625, |
| "completions/mean_terminated_length": 1113.8333740234375, |
| "completions/min_length": 475.0, |
| "completions/min_terminated_length": 475.0, |
| "epoch": 0.027428571428571427, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.196747824549675, |
| "learning_rate": 9.993832906395582e-07, |
| "loss": 0.0687, |
| "num_tokens": 2975404.0, |
| "reward": 0.04860962927341461, |
| "reward_std": 0.8576602935791016, |
| "rewards/cosine_scaled_reward/mean": -0.1475701779127121, |
| "rewards/cosine_scaled_reward/std": 0.4082482159137726, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 24 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1752.0, |
| "completions/mean_length": 1695.234375, |
| "completions/mean_terminated_length": 919.1500244140625, |
| "completions/min_length": 502.0, |
| "completions/min_terminated_length": 502.0, |
| "epoch": 0.02857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22251193225383759, |
| "learning_rate": 9.989038226169207e-07, |
| "loss": 0.0401, |
| "num_tokens": 3094195.0, |
| "reward": 0.2244701385498047, |
| "reward_std": 0.6461865901947021, |
| "rewards/cosine_scaled_reward/mean": -0.06745242327451706, |
| "rewards/cosine_scaled_reward/std": 0.41534900665283203, |
| "rewards/format_reward/mean": 0.359375, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 25 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2016.0, |
| "completions/mean_length": 1974.4375, |
| "completions/mean_terminated_length": 1524.888916015625, |
| "completions/min_length": 1105.0, |
| "completions/min_terminated_length": 1105.0, |
| "epoch": 0.029714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23350541293621063, |
| "learning_rate": 9.982876141412855e-07, |
| "loss": 0.0101, |
| "num_tokens": 3231191.0, |
| "reward": 0.16762161254882812, |
| "reward_std": 0.5227605104446411, |
| "rewards/cosine_scaled_reward/mean": -0.041189197450876236, |
| "rewards/cosine_scaled_reward/std": 0.37332749366760254, |
| "rewards/format_reward/mean": 0.25, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 26 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.828125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1999.0, |
| "completions/mean_length": 1915.5, |
| "completions/mean_terminated_length": 1277.0909423828125, |
| "completions/min_length": 554.0, |
| "completions/min_terminated_length": 554.0, |
| "epoch": 0.030857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21174418926239014, |
| "learning_rate": 9.975348529157229e-07, |
| "loss": 0.0312, |
| "num_tokens": 3364071.0, |
| "reward": -0.18293717503547668, |
| "reward_std": 0.5386844873428345, |
| "rewards/cosine_scaled_reward/mean": -0.20865610241889954, |
| "rewards/cosine_scaled_reward/std": 0.2562413811683655, |
| "rewards/format_reward/mean": 0.234375, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 27 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2007.0, |
| "completions/mean_length": 1815.140625, |
| "completions/mean_terminated_length": 1220.0555419921875, |
| "completions/min_length": 445.0, |
| "completions/min_terminated_length": 445.0, |
| "epoch": 0.032, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.213092640042305, |
| "learning_rate": 9.96645768238595e-07, |
| "loss": 0.0361, |
| "num_tokens": 3490576.0, |
| "reward": 0.04266031086444855, |
| "reward_std": 0.776748776435852, |
| "rewards/cosine_scaled_reward/mean": -0.13491985201835632, |
| "rewards/cosine_scaled_reward/std": 0.37269750237464905, |
| "rewards/format_reward/mean": 0.3125, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 28 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2033.0, |
| "completions/mean_length": 1906.15625, |
| "completions/mean_terminated_length": 1039.3333740234375, |
| "completions/min_length": 633.0, |
| "completions/min_terminated_length": 633.0, |
| "epoch": 0.03314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22322852909564972, |
| "learning_rate": 9.956206309337066e-07, |
| "loss": 0.0389, |
| "num_tokens": 3623042.0, |
| "reward": -0.1004815474152565, |
| "reward_std": 0.539789080619812, |
| "rewards/cosine_scaled_reward/mean": -0.12836576998233795, |
| "rewards/cosine_scaled_reward/std": 0.28681084513664246, |
| "rewards/format_reward/mean": 0.15625, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 29 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1992.0, |
| "completions/mean_length": 1854.1875, |
| "completions/mean_terminated_length": 1162.0, |
| "completions/min_length": 592.0, |
| "completions/min_terminated_length": 592.0, |
| "epoch": 0.03428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2063974291086197, |
| "learning_rate": 9.944597532678119e-07, |
| "loss": 0.0115, |
| "num_tokens": 3752246.0, |
| "reward": -0.030107807368040085, |
| "reward_std": 0.6322507858276367, |
| "rewards/cosine_scaled_reward/mean": -0.1634913980960846, |
| "rewards/cosine_scaled_reward/std": 0.31110286712646484, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 30 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1523.0, |
| "completions/mean_length": 1841.15625, |
| "completions/mean_terminated_length": 724.2000122070312, |
| "completions/min_length": 165.0, |
| "completions/min_terminated_length": 165.0, |
| "epoch": 0.03542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2025275081396103, |
| "learning_rate": 9.931634888554935e-07, |
| "loss": 0.0143, |
| "num_tokens": 3880576.0, |
| "reward": -0.34719598293304443, |
| "reward_std": 0.5259275436401367, |
| "rewards/cosine_scaled_reward/mean": -0.2595354914665222, |
| "rewards/cosine_scaled_reward/std": 0.24079306423664093, |
| "rewards/format_reward/mean": 0.171875, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 31 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1998.0, |
| "completions/mean_length": 1945.65625, |
| "completions/mean_terminated_length": 1393.0, |
| "completions/min_length": 899.0, |
| "completions/min_terminated_length": 899.0, |
| "epoch": 0.036571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22421319782733917, |
| "learning_rate": 9.917322325514487e-07, |
| "loss": 0.0542, |
| "num_tokens": 4015450.0, |
| "reward": -0.2238868921995163, |
| "reward_std": 0.6127103567123413, |
| "rewards/cosine_scaled_reward/mean": -0.20569345355033875, |
| "rewards/cosine_scaled_reward/std": 0.26141345500946045, |
| "rewards/format_reward/mean": 0.1875, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 32 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1959.0, |
| "completions/mean_length": 1976.890625, |
| "completions/mean_terminated_length": 1289.5, |
| "completions/min_length": 581.0, |
| "completions/min_terminated_length": 581.0, |
| "epoch": 0.037714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2219865769147873, |
| "learning_rate": 9.901664203302124e-07, |
| "loss": 0.0139, |
| "num_tokens": 4153187.0, |
| "reward": -0.5050230026245117, |
| "reward_std": 0.38754361867904663, |
| "rewards/cosine_scaled_reward/mean": -0.31501150131225586, |
| "rewards/cosine_scaled_reward/std": 0.19765734672546387, |
| "rewards/format_reward/mean": 0.125, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 33 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.515625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1573.625, |
| "completions/mean_terminated_length": 1068.6451416015625, |
| "completions/min_length": 517.0, |
| "completions/min_terminated_length": 517.0, |
| "epoch": 0.038857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23140408098697662, |
| "learning_rate": 9.88466529153356e-07, |
| "loss": 0.0697, |
| "num_tokens": 4263451.0, |
| "reward": 0.3802332282066345, |
| "reward_std": 0.8625352382659912, |
| "rewards/cosine_scaled_reward/mean": -0.05207090824842453, |
| "rewards/cosine_scaled_reward/std": 0.4423771798610687, |
| "rewards/format_reward/mean": 0.484375, |
| "rewards/format_reward/std": 0.5037065148353577, |
| "step": 34 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1957.0, |
| "completions/mean_length": 1814.875, |
| "completions/mean_terminated_length": 1053.3333740234375, |
| "completions/min_length": 359.0, |
| "completions/min_terminated_length": 359.0, |
| "epoch": 0.04, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21747314929962158, |
| "learning_rate": 9.866330768241983e-07, |
| "loss": 0.0288, |
| "num_tokens": 4391099.0, |
| "reward": 0.11022068560123444, |
| "reward_std": 0.898347795009613, |
| "rewards/cosine_scaled_reward/mean": -0.08551465719938278, |
| "rewards/cosine_scaled_reward/std": 0.4119128882884979, |
| "rewards/format_reward/mean": 0.28125, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 35 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.90625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1775.0, |
| "completions/mean_length": 1976.765625, |
| "completions/mean_terminated_length": 1288.166748046875, |
| "completions/min_length": 964.0, |
| "completions/min_terminated_length": 964.0, |
| "epoch": 0.04114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23834578692913055, |
| "learning_rate": 9.846666218300807e-07, |
| "loss": 0.021, |
| "num_tokens": 4528724.0, |
| "reward": -0.38736510276794434, |
| "reward_std": 0.5356569290161133, |
| "rewards/cosine_scaled_reward/mean": -0.24837006628513336, |
| "rewards/cosine_scaled_reward/std": 0.23275430500507355, |
| "rewards/format_reward/mean": 0.109375, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 36 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.9375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2047.0, |
| "completions/mean_length": 2013.9375, |
| "completions/mean_terminated_length": 1503.0, |
| "completions/min_length": 1027.0, |
| "completions/min_terminated_length": 1027.0, |
| "epoch": 0.04228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22654284536838531, |
| "learning_rate": 9.825677631722435e-07, |
| "loss": 0.0142, |
| "num_tokens": 4668640.0, |
| "reward": -0.42377781867980957, |
| "reward_std": 0.379480242729187, |
| "rewards/cosine_scaled_reward/mean": -0.2665764391422272, |
| "rewards/cosine_scaled_reward/std": 0.18001720309257507, |
| "rewards/format_reward/mean": 0.109375, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 37 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1254.0, |
| "completions/mean_length": 1912.265625, |
| "completions/mean_terminated_length": 1082.77783203125, |
| "completions/min_length": 920.0, |
| "completions/min_terminated_length": 920.0, |
| "epoch": 0.04342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22495149075984955, |
| "learning_rate": 9.80337140183366e-07, |
| "loss": 0.0353, |
| "num_tokens": 4802737.0, |
| "reward": -0.15185467898845673, |
| "reward_std": 0.38927191495895386, |
| "rewards/cosine_scaled_reward/mean": -0.14623984694480896, |
| "rewards/cosine_scaled_reward/std": 0.32866883277893066, |
| "rewards/format_reward/mean": 0.140625, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 38 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1979.0, |
| "completions/mean_length": 1678.4375, |
| "completions/mean_terminated_length": 656.7058715820312, |
| "completions/min_length": 300.0, |
| "completions/min_terminated_length": 300.0, |
| "epoch": 0.044571428571428574, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19435559213161469, |
| "learning_rate": 9.779754323328192e-07, |
| "loss": 0.0416, |
| "num_tokens": 4920941.0, |
| "reward": 0.17510981857776642, |
| "reward_std": 0.559760570526123, |
| "rewards/cosine_scaled_reward/mean": -0.0765075832605362, |
| "rewards/cosine_scaled_reward/std": 0.3369429409503937, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 39 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1762.0, |
| "completions/mean_length": 1682.25, |
| "completions/mean_terminated_length": 877.6000366210938, |
| "completions/min_length": 465.0, |
| "completions/min_terminated_length": 465.0, |
| "epoch": 0.045714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19440439343452454, |
| "learning_rate": 9.754833590196926e-07, |
| "loss": 0.0685, |
| "num_tokens": 5038677.0, |
| "reward": 0.09382888674736023, |
| "reward_std": 0.4140171706676483, |
| "rewards/cosine_scaled_reward/mean": -0.12496057152748108, |
| "rewards/cosine_scaled_reward/std": 0.3649806082248688, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 40 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1905.0, |
| "completions/mean_length": 1841.0625, |
| "completions/mean_terminated_length": 1417.3333740234375, |
| "completions/min_length": 965.0, |
| "completions/min_terminated_length": 965.0, |
| "epoch": 0.046857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21685408055782318, |
| "learning_rate": 9.728616793536587e-07, |
| "loss": 0.0105, |
| "num_tokens": 5167657.0, |
| "reward": -0.15476089715957642, |
| "reward_std": 0.5854519605636597, |
| "rewards/cosine_scaled_reward/mean": -0.2648804187774658, |
| "rewards/cosine_scaled_reward/std": 0.26939424872398376, |
| "rewards/format_reward/mean": 0.375, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 41 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1933.0, |
| "completions/mean_length": 1695.0, |
| "completions/mean_terminated_length": 719.058837890625, |
| "completions/min_length": 205.0, |
| "completions/min_terminated_length": 205.0, |
| "epoch": 0.048, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.23794473707675934, |
| "learning_rate": 9.701111919237408e-07, |
| "loss": 0.0327, |
| "num_tokens": 5286497.0, |
| "reward": -0.2923233211040497, |
| "reward_std": 0.36149862408638, |
| "rewards/cosine_scaled_reward/mean": -0.27897417545318604, |
| "rewards/cosine_scaled_reward/std": 0.17192503809928894, |
| "rewards/format_reward/mean": 0.265625, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 42 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2045.0, |
| "completions/mean_length": 1793.84375, |
| "completions/mean_terminated_length": 1031.375, |
| "completions/min_length": 714.0, |
| "completions/min_terminated_length": 714.0, |
| "epoch": 0.04914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21354877948760986, |
| "learning_rate": 9.672327345550543e-07, |
| "loss": 0.0597, |
| "num_tokens": 5412919.0, |
| "reward": -0.0004070103168487549, |
| "reward_std": 0.5297929048538208, |
| "rewards/cosine_scaled_reward/mean": -0.12520350515842438, |
| "rewards/cosine_scaled_reward/std": 0.3128352463245392, |
| "rewards/format_reward/mean": 0.25, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 43 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1729.0, |
| "completions/mean_length": 1651.1875, |
| "completions/mean_terminated_length": 778.2000122070312, |
| "completions/min_length": 251.0, |
| "completions/min_terminated_length": 251.0, |
| "epoch": 0.05028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20100052654743195, |
| "learning_rate": 9.64227184053598e-07, |
| "loss": 0.0089, |
| "num_tokens": 5529291.0, |
| "reward": 0.13101597130298615, |
| "reward_std": 0.5976744890213013, |
| "rewards/cosine_scaled_reward/mean": -0.09855452179908752, |
| "rewards/cosine_scaled_reward/std": 0.46046286821365356, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 44 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.921875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1953.0, |
| "completions/mean_length": 2011.765625, |
| "completions/mean_terminated_length": 1584.2000732421875, |
| "completions/min_length": 1146.0, |
| "completions/min_terminated_length": 1146.0, |
| "epoch": 0.05142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2144525796175003, |
| "learning_rate": 9.610954559391704e-07, |
| "loss": 0.0108, |
| "num_tokens": 5669700.0, |
| "reward": -0.15992262959480286, |
| "reward_std": 0.5183610916137695, |
| "rewards/cosine_scaled_reward/mean": -0.14246131479740143, |
| "rewards/cosine_scaled_reward/std": 0.37169432640075684, |
| "rewards/format_reward/mean": 0.125, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 45 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1566.0, |
| "completions/mean_length": 1811.21875, |
| "completions/mean_terminated_length": 965.5714721679688, |
| "completions/min_length": 578.0, |
| "completions/min_terminated_length": 578.0, |
| "epoch": 0.052571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2243409901857376, |
| "learning_rate": 9.578385041664925e-07, |
| "loss": 0.0324, |
| "num_tokens": 5796786.0, |
| "reward": -0.2682954668998718, |
| "reward_std": 0.47855472564697266, |
| "rewards/cosine_scaled_reward/mean": -0.2435227334499359, |
| "rewards/cosine_scaled_reward/std": 0.21708372235298157, |
| "rewards/format_reward/mean": 0.21875, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 46 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.71875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1730.0, |
| "completions/mean_length": 1720.15625, |
| "completions/mean_terminated_length": 882.3333129882812, |
| "completions/min_length": 432.0, |
| "completions/min_terminated_length": 432.0, |
| "epoch": 0.053714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1925242692232132, |
| "learning_rate": 9.54457320834625e-07, |
| "loss": 0.0641, |
| "num_tokens": 5917276.0, |
| "reward": -0.03124237060546875, |
| "reward_std": 0.6693180203437805, |
| "rewards/cosine_scaled_reward/mean": -0.17968368530273438, |
| "rewards/cosine_scaled_reward/std": 0.379862517118454, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 47 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 1723.734375, |
| "completions/mean_terminated_length": 1059.761962890625, |
| "completions/min_length": 617.0, |
| "completions/min_terminated_length": 617.0, |
| "epoch": 0.054857142857142854, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20502391457557678, |
| "learning_rate": 9.509529358847654e-07, |
| "loss": 0.0544, |
| "num_tokens": 6038139.0, |
| "reward": 0.21815460920333862, |
| "reward_std": 0.6701791286468506, |
| "rewards/cosine_scaled_reward/mean": -0.05498518794775009, |
| "rewards/cosine_scaled_reward/std": 0.42852458357810974, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 48 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.515625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1571.0, |
| "completions/mean_length": 1450.3125, |
| "completions/mean_terminated_length": 814.0645141601562, |
| "completions/min_length": 275.0, |
| "completions/min_terminated_length": 275.0, |
| "epoch": 0.056, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18392837047576904, |
| "learning_rate": 9.473264167865171e-07, |
| "loss": 0.0771, |
| "num_tokens": 6141023.0, |
| "reward": 0.20156216621398926, |
| "reward_std": 0.7049944400787354, |
| "rewards/cosine_scaled_reward/mean": -0.14921891689300537, |
| "rewards/cosine_scaled_reward/std": 0.35212206840515137, |
| "rewards/format_reward/mean": 0.5, |
| "rewards/format_reward/std": 0.5039526224136353, |
| "step": 49 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.78125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1605.0, |
| "completions/mean_length": 1740.84375, |
| "completions/mean_terminated_length": 643.857177734375, |
| "completions/min_length": 307.0, |
| "completions/min_terminated_length": 307.0, |
| "epoch": 0.05714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1936425119638443, |
| "learning_rate": 9.43578868212728e-07, |
| "loss": 0.0292, |
| "num_tokens": 6263253.0, |
| "reward": -0.08827750384807587, |
| "reward_std": 0.3788633346557617, |
| "rewards/cosine_scaled_reward/mean": -0.16132624447345734, |
| "rewards/cosine_scaled_reward/std": 0.36572694778442383, |
| "rewards/format_reward/mean": 0.234375, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 50 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1543.0, |
| "completions/mean_length": 1443.46875, |
| "completions/mean_terminated_length": 758.3333740234375, |
| "completions/min_length": 398.0, |
| "completions/min_terminated_length": 398.0, |
| "epoch": 0.05828571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19979657232761383, |
| "learning_rate": 9.397114317029974e-07, |
| "loss": 0.0422, |
| "num_tokens": 6365843.0, |
| "reward": 0.09200635552406311, |
| "reward_std": 0.5713317394256592, |
| "rewards/cosine_scaled_reward/mean": -0.18837183713912964, |
| "rewards/cosine_scaled_reward/std": 0.28224873542785645, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 51 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.734375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1883.0, |
| "completions/mean_length": 1713.5, |
| "completions/mean_terminated_length": 788.7058715820312, |
| "completions/min_length": 282.0, |
| "completions/min_terminated_length": 282.0, |
| "epoch": 0.05942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2033102959394455, |
| "learning_rate": 9.357252853159505e-07, |
| "loss": 0.0054, |
| "num_tokens": 6486859.0, |
| "reward": -0.013391643762588501, |
| "reward_std": 0.41247767210006714, |
| "rewards/cosine_scaled_reward/mean": -0.14732082188129425, |
| "rewards/cosine_scaled_reward/std": 0.3900400698184967, |
| "rewards/format_reward/mean": 0.28125, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 52 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.765625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1233.0, |
| "completions/mean_length": 1724.828125, |
| "completions/mean_terminated_length": 669.1333618164062, |
| "completions/min_length": 400.0, |
| "completions/min_terminated_length": 400.0, |
| "epoch": 0.060571428571428575, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20569661259651184, |
| "learning_rate": 9.316216432703916e-07, |
| "loss": 0.0366, |
| "num_tokens": 6608080.0, |
| "reward": -0.0075955986976623535, |
| "reward_std": 0.6965757012367249, |
| "rewards/cosine_scaled_reward/mean": -0.15223531424999237, |
| "rewards/cosine_scaled_reward/std": 0.32304710149765015, |
| "rewards/format_reward/mean": 0.296875, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 53 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1923.0, |
| "completions/mean_length": 1761.859375, |
| "completions/mean_terminated_length": 1132.3499755859375, |
| "completions/min_length": 602.0, |
| "completions/min_terminated_length": 602.0, |
| "epoch": 0.061714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19333088397979736, |
| "learning_rate": 9.274017555754407e-07, |
| "loss": 0.0673, |
| "num_tokens": 6731983.0, |
| "reward": 0.35599154233932495, |
| "reward_std": 1.0488793849945068, |
| "rewards/cosine_scaled_reward/mean": -0.05637925863265991, |
| "rewards/cosine_scaled_reward/std": 0.46367430686950684, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 54 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1820.0, |
| "completions/mean_length": 1711.0, |
| "completions/mean_terminated_length": 969.6000366210938, |
| "completions/min_length": 568.0, |
| "completions/min_terminated_length": 568.0, |
| "epoch": 0.06285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17474976181983948, |
| "learning_rate": 9.230669076497687e-07, |
| "loss": 0.0316, |
| "num_tokens": 6851775.0, |
| "reward": 0.20368073880672455, |
| "reward_std": 0.4746112525463104, |
| "rewards/cosine_scaled_reward/mean": -0.05440961569547653, |
| "rewards/cosine_scaled_reward/std": 0.4434376358985901, |
| "rewards/format_reward/mean": 0.3125, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 55 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1844.0, |
| "completions/mean_length": 1733.625, |
| "completions/mean_terminated_length": 1042.0, |
| "completions/min_length": 534.0, |
| "completions/min_terminated_length": 534.0, |
| "epoch": 0.064, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19868570566177368, |
| "learning_rate": 9.186184199300463e-07, |
| "loss": 0.0503, |
| "num_tokens": 6973687.0, |
| "reward": 0.19238728284835815, |
| "reward_std": 0.5642611980438232, |
| "rewards/cosine_scaled_reward/mean": -0.10693138092756271, |
| "rewards/cosine_scaled_reward/std": 0.48442336916923523, |
| "rewards/format_reward/mean": 0.40625, |
| "rewards/format_reward/std": 0.49501484632492065, |
| "step": 56 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1998.0, |
| "completions/mean_length": 1957.140625, |
| "completions/mean_terminated_length": 1401.888916015625, |
| "completions/min_length": 919.0, |
| "completions/min_terminated_length": 919.0, |
| "epoch": 0.06514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1979535073041916, |
| "learning_rate": 9.140576474687263e-07, |
| "loss": 0.0417, |
| "num_tokens": 7110512.0, |
| "reward": -0.4044339656829834, |
| "reward_std": 0.4291505217552185, |
| "rewards/cosine_scaled_reward/mean": -0.3037794828414917, |
| "rewards/cosine_scaled_reward/std": 0.17916814982891083, |
| "rewards/format_reward/mean": 0.203125, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 57 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2003.0, |
| "completions/mean_length": 1508.109375, |
| "completions/mean_terminated_length": 968.21875, |
| "completions/min_length": 367.0, |
| "completions/min_terminated_length": 367.0, |
| "epoch": 0.06628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17342056334018707, |
| "learning_rate": 9.093859795212817e-07, |
| "loss": 0.0248, |
| "num_tokens": 7217127.0, |
| "reward": 0.6014055013656616, |
| "reward_std": 0.8353673219680786, |
| "rewards/cosine_scaled_reward/mean": -0.019609764218330383, |
| "rewards/cosine_scaled_reward/std": 0.4545621871948242, |
| "rewards/format_reward/mean": 0.640625, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 58 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1806.0, |
| "completions/mean_length": 1733.078125, |
| "completions/mean_terminated_length": 1088.2381591796875, |
| "completions/min_length": 541.0, |
| "completions/min_terminated_length": 541.0, |
| "epoch": 0.06742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18201254308223724, |
| "learning_rate": 9.046048391230247e-07, |
| "loss": 0.0086, |
| "num_tokens": 7338508.0, |
| "reward": 0.13016025722026825, |
| "reward_std": 0.5339452624320984, |
| "rewards/cosine_scaled_reward/mean": -0.11460737138986588, |
| "rewards/cosine_scaled_reward/std": 0.40606313943862915, |
| "rewards/format_reward/mean": 0.359375, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 59 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.75, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1347.0, |
| "completions/mean_length": 1773.171875, |
| "completions/mean_terminated_length": 948.6875, |
| "completions/min_length": 635.0, |
| "completions/min_terminated_length": 635.0, |
| "epoch": 0.06857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17932923138141632, |
| "learning_rate": 8.997156826556369e-07, |
| "loss": 0.0516, |
| "num_tokens": 7462879.0, |
| "reward": -0.2482871562242508, |
| "reward_std": 0.4085908830165863, |
| "rewards/cosine_scaled_reward/mean": -0.2569561004638672, |
| "rewards/cosine_scaled_reward/std": 0.2272651493549347, |
| "rewards/format_reward/mean": 0.265625, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 60 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1726.890625, |
| "completions/mean_terminated_length": 966.368408203125, |
| "completions/min_length": 470.0, |
| "completions/min_terminated_length": 470.0, |
| "epoch": 0.06971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20350487530231476, |
| "learning_rate": 8.9471999940354e-07, |
| "loss": 0.0252, |
| "num_tokens": 7584920.0, |
| "reward": 0.2052871733903885, |
| "reward_std": 0.7419347763061523, |
| "rewards/cosine_scaled_reward/mean": -0.10829392075538635, |
| "rewards/cosine_scaled_reward/std": 0.31667017936706543, |
| "rewards/format_reward/mean": 0.421875, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 61 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1668.0, |
| "completions/mean_length": 1549.28125, |
| "completions/mean_terminated_length": 865.8518676757812, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.07085714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17975586652755737, |
| "learning_rate": 8.896193111002475e-07, |
| "loss": 0.049, |
| "num_tokens": 7694306.0, |
| "reward": 0.3729054629802704, |
| "reward_std": 0.6217197775840759, |
| "rewards/cosine_scaled_reward/mean": -0.055734772235155106, |
| "rewards/cosine_scaled_reward/std": 0.3869990408420563, |
| "rewards/format_reward/mean": 0.484375, |
| "rewards/format_reward/std": 0.5037065148353577, |
| "step": 62 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1891.0, |
| "completions/mean_length": 1506.75, |
| "completions/mean_terminated_length": 1136.4210205078125, |
| "completions/min_length": 479.0, |
| "completions/min_terminated_length": 479.0, |
| "epoch": 0.072, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19678133726119995, |
| "learning_rate": 8.844151714648274e-07, |
| "loss": 0.0991, |
| "num_tokens": 7800994.0, |
| "reward": 0.5956183671951294, |
| "reward_std": 0.7010378837585449, |
| "rewards/cosine_scaled_reward/mean": -0.0068783238530159, |
| "rewards/cosine_scaled_reward/std": 0.4637373983860016, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 63 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1813.0, |
| "completions/mean_length": 1494.484375, |
| "completions/mean_terminated_length": 974.5151977539062, |
| "completions/min_length": 389.0, |
| "completions/min_terminated_length": 389.0, |
| "epoch": 0.07314285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19099701941013336, |
| "learning_rate": 8.791091657286267e-07, |
| "loss": 0.0732, |
| "num_tokens": 7907593.0, |
| "reward": 0.28888779878616333, |
| "reward_std": 0.6505820751190186, |
| "rewards/cosine_scaled_reward/mean": -0.13680610060691833, |
| "rewards/cosine_scaled_reward/std": 0.36594465374946594, |
| "rewards/format_reward/mean": 0.5625, |
| "rewards/format_reward/std": 0.5, |
| "step": 64 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.6875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1689.0, |
| "completions/mean_length": 1618.453125, |
| "completions/mean_terminated_length": 673.4500122070312, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.07428571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18873563408851624, |
| "learning_rate": 8.737029101523929e-07, |
| "loss": 0.0291, |
| "num_tokens": 8021878.0, |
| "reward": -0.08997441828250885, |
| "reward_std": 0.3741680085659027, |
| "rewards/cosine_scaled_reward/mean": -0.20904971659183502, |
| "rewards/cosine_scaled_reward/std": 0.35118550062179565, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 65 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1513.0, |
| "completions/mean_length": 1217.859375, |
| "completions/mean_terminated_length": 719.7750244140625, |
| "completions/min_length": 226.0, |
| "completions/min_terminated_length": 226.0, |
| "epoch": 0.07542857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15865544974803925, |
| "learning_rate": 8.681980515339463e-07, |
| "loss": 0.0289, |
| "num_tokens": 8110053.0, |
| "reward": 0.5319543480873108, |
| "reward_std": 0.7594929337501526, |
| "rewards/cosine_scaled_reward/mean": -0.046522848308086395, |
| "rewards/cosine_scaled_reward/std": 0.47548800706863403, |
| "rewards/format_reward/mean": 0.625, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 66 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.859375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1708.0, |
| "completions/mean_length": 1944.203125, |
| "completions/mean_terminated_length": 1309.888916015625, |
| "completions/min_length": 827.0, |
| "completions/min_terminated_length": 827.0, |
| "epoch": 0.07657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1986117660999298, |
| "learning_rate": 8.625962667065487e-07, |
| "loss": 0.0261, |
| "num_tokens": 8245218.0, |
| "reward": -0.3390616178512573, |
| "reward_std": 0.38135582208633423, |
| "rewards/cosine_scaled_reward/mean": -0.25546830892562866, |
| "rewards/cosine_scaled_reward/std": 0.1776033639907837, |
| "rewards/format_reward/mean": 0.171875, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 67 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1960.0, |
| "completions/mean_length": 997.09375, |
| "completions/mean_terminated_length": 675.3877563476562, |
| "completions/min_length": 269.0, |
| "completions/min_terminated_length": 269.0, |
| "epoch": 0.07771428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13545449078083038, |
| "learning_rate": 8.568992620281243e-07, |
| "loss": 0.0228, |
| "num_tokens": 8318288.0, |
| "reward": 0.7294949293136597, |
| "reward_std": 0.5788470506668091, |
| "rewards/cosine_scaled_reward/mean": -0.018065020442008972, |
| "rewards/cosine_scaled_reward/std": 0.42799946665763855, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 68 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1640.0, |
| "completions/mean_length": 1297.90625, |
| "completions/mean_terminated_length": 817.076904296875, |
| "completions/min_length": 375.0, |
| "completions/min_terminated_length": 375.0, |
| "epoch": 0.07885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18707174062728882, |
| "learning_rate": 8.511087728614862e-07, |
| "loss": 0.0366, |
| "num_tokens": 8411698.0, |
| "reward": 0.20155681669712067, |
| "reward_std": 0.5115354061126709, |
| "rewards/cosine_scaled_reward/mean": -0.21172159910202026, |
| "rewards/cosine_scaled_reward/std": 0.30984631180763245, |
| "rewards/format_reward/mean": 0.625, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 69 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.8125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1986.0, |
| "completions/mean_length": 1842.484375, |
| "completions/mean_terminated_length": 951.9166870117188, |
| "completions/min_length": 444.0, |
| "completions/min_terminated_length": 444.0, |
| "epoch": 0.08, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1955178827047348, |
| "learning_rate": 8.452265630457282e-07, |
| "loss": 0.0137, |
| "num_tokens": 8541073.0, |
| "reward": -0.1862781047821045, |
| "reward_std": 0.5197064876556396, |
| "rewards/cosine_scaled_reward/mean": -0.21032655239105225, |
| "rewards/cosine_scaled_reward/std": 0.26505687832832336, |
| "rewards/format_reward/mean": 0.234375, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 70 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1884.0, |
| "completions/mean_length": 1553.671875, |
| "completions/mean_terminated_length": 831.1923217773438, |
| "completions/min_length": 356.0, |
| "completions/min_terminated_length": 356.0, |
| "epoch": 0.08114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19525648653507233, |
| "learning_rate": 8.392544243589427e-07, |
| "loss": 0.0346, |
| "num_tokens": 8651228.0, |
| "reward": 0.3039510250091553, |
| "reward_std": 0.7005565762519836, |
| "rewards/cosine_scaled_reward/mean": -0.05114949122071266, |
| "rewards/cosine_scaled_reward/std": 0.47836223244667053, |
| "rewards/format_reward/mean": 0.40625, |
| "rewards/format_reward/std": 0.49501484632492065, |
| "step": 71 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1733.0, |
| "completions/mean_length": 1406.8125, |
| "completions/mean_terminated_length": 841.058837890625, |
| "completions/min_length": 346.0, |
| "completions/min_terminated_length": 346.0, |
| "epoch": 0.08228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17535589635372162, |
| "learning_rate": 8.331941759724268e-07, |
| "loss": 0.0434, |
| "num_tokens": 8751616.0, |
| "reward": 0.21918153762817383, |
| "reward_std": 0.4695218801498413, |
| "rewards/cosine_scaled_reward/mean": -0.1716592162847519, |
| "rewards/cosine_scaled_reward/std": 0.21545428037643433, |
| "rewards/format_reward/mean": 0.5625, |
| "rewards/format_reward/std": 0.5, |
| "step": 72 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.84375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1867.0, |
| "completions/mean_length": 1944.015625, |
| "completions/mean_terminated_length": 1382.5, |
| "completions/min_length": 951.0, |
| "completions/min_terminated_length": 951.0, |
| "epoch": 0.08342857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.21378304064273834, |
| "learning_rate": 8.270476638965461e-07, |
| "loss": 0.0319, |
| "num_tokens": 8886761.0, |
| "reward": -0.37639105319976807, |
| "reward_std": 0.4715355932712555, |
| "rewards/cosine_scaled_reward/mean": -0.26632052659988403, |
| "rewards/cosine_scaled_reward/std": 0.23604609072208405, |
| "rewards/format_reward/mean": 0.15625, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 73 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.671875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1967.0, |
| "completions/mean_length": 1745.1875, |
| "completions/mean_terminated_length": 1125.142822265625, |
| "completions/min_length": 590.0, |
| "completions/min_terminated_length": 590.0, |
| "epoch": 0.08457142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17263740301132202, |
| "learning_rate": 8.208167604184217e-07, |
| "loss": 0.0465, |
| "num_tokens": 9008381.0, |
| "reward": 0.2182944416999817, |
| "reward_std": 0.5771346092224121, |
| "rewards/cosine_scaled_reward/mean": -0.06272779405117035, |
| "rewards/cosine_scaled_reward/std": 0.4549061059951782, |
| "rewards/format_reward/mean": 0.34375, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 74 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1717.484375, |
| "completions/mean_terminated_length": 1292.5357666015625, |
| "completions/min_length": 660.0, |
| "completions/min_terminated_length": 660.0, |
| "epoch": 0.08571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19337797164916992, |
| "learning_rate": 8.145033635316128e-07, |
| "loss": 0.0729, |
| "num_tokens": 9129260.0, |
| "reward": 0.32763606309890747, |
| "reward_std": 0.5694445371627808, |
| "rewards/cosine_scaled_reward/mean": -0.07055696845054626, |
| "rewards/cosine_scaled_reward/std": 0.48110467195510864, |
| "rewards/format_reward/mean": 0.46875, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 75 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1544.0, |
| "completions/mean_length": 1465.96875, |
| "completions/mean_terminated_length": 883.9375, |
| "completions/min_length": 391.0, |
| "completions/min_terminated_length": 391.0, |
| "epoch": 0.08685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18641816079616547, |
| "learning_rate": 8.081093963579707e-07, |
| "loss": 0.018, |
| "num_tokens": 9233482.0, |
| "reward": -0.015750765800476074, |
| "reward_std": 0.4846976697444916, |
| "rewards/cosine_scaled_reward/mean": -0.26568788290023804, |
| "rewards/cosine_scaled_reward/std": 0.22177822887897491, |
| "rewards/format_reward/mean": 0.515625, |
| "rewards/format_reward/std": 0.5037065148353577, |
| "step": 76 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1894.0, |
| "completions/mean_length": 1673.03125, |
| "completions/mean_terminated_length": 1004.6087036132812, |
| "completions/min_length": 574.0, |
| "completions/min_terminated_length": 574.0, |
| "epoch": 0.088, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1897410899400711, |
| "learning_rate": 8.01636806561836e-07, |
| "loss": 0.0314, |
| "num_tokens": 9352132.0, |
| "reward": 0.2287338674068451, |
| "reward_std": 0.3902437686920166, |
| "rewards/cosine_scaled_reward/mean": -0.06532055139541626, |
| "rewards/cosine_scaled_reward/std": 0.3456854224205017, |
| "rewards/format_reward/mean": 0.359375, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 77 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.640625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1847.0, |
| "completions/mean_length": 1693.9375, |
| "completions/mean_terminated_length": 1062.7825927734375, |
| "completions/min_length": 541.0, |
| "completions/min_terminated_length": 541.0, |
| "epoch": 0.08914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19339214265346527, |
| "learning_rate": 7.950875657567621e-07, |
| "loss": 0.0274, |
| "num_tokens": 9471264.0, |
| "reward": 0.21436695754528046, |
| "reward_std": 0.47337740659713745, |
| "rewards/cosine_scaled_reward/mean": -0.08031650632619858, |
| "rewards/cosine_scaled_reward/std": 0.45170801877975464, |
| "rewards/format_reward/mean": 0.375, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 78 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1981.0, |
| "completions/mean_length": 1369.875, |
| "completions/mean_terminated_length": 875.0270385742188, |
| "completions/min_length": 216.0, |
| "completions/min_terminated_length": 216.0, |
| "epoch": 0.09028571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16123633086681366, |
| "learning_rate": 7.884636689049422e-07, |
| "loss": 0.028, |
| "num_tokens": 9569032.0, |
| "reward": 0.34234681725502014, |
| "reward_std": 0.5154464840888977, |
| "rewards/cosine_scaled_reward/mean": -0.12570157647132874, |
| "rewards/cosine_scaled_reward/std": 0.37586960196495056, |
| "rewards/format_reward/mean": 0.59375, |
| "rewards/format_reward/std": 0.49501484632492065, |
| "step": 79 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.59375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1933.0, |
| "completions/mean_length": 1728.5, |
| "completions/mean_terminated_length": 1261.5384521484375, |
| "completions/min_length": 521.0, |
| "completions/min_terminated_length": 521.0, |
| "epoch": 0.09142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2202686369419098, |
| "learning_rate": 7.817671337095244e-07, |
| "loss": 0.0315, |
| "num_tokens": 9690048.0, |
| "reward": 0.14108766615390778, |
| "reward_std": 0.6474246382713318, |
| "rewards/cosine_scaled_reward/mean": -0.1482061743736267, |
| "rewards/cosine_scaled_reward/std": 0.35231441259384155, |
| "rewards/format_reward/mean": 0.4375, |
| "rewards/format_reward/std": 0.5, |
| "step": 80 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.515625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1512.5, |
| "completions/mean_terminated_length": 942.4515991210938, |
| "completions/min_length": 276.0, |
| "completions/min_terminated_length": 276.0, |
| "epoch": 0.09257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2079077959060669, |
| "learning_rate": 7.75e-07, |
| "loss": 0.0289, |
| "num_tokens": 9798120.0, |
| "reward": 0.18197788298130035, |
| "reward_std": 0.6896297931671143, |
| "rewards/cosine_scaled_reward/mean": -0.17463606595993042, |
| "rewards/cosine_scaled_reward/std": 0.33339765667915344, |
| "rewards/format_reward/mean": 0.53125, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 81 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1964.0, |
| "completions/mean_length": 1331.59375, |
| "completions/mean_terminated_length": 981.720947265625, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.09371428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17440767586231232, |
| "learning_rate": 7.681643291108517e-07, |
| "loss": 0.0501, |
| "num_tokens": 9893670.0, |
| "reward": 0.6656259894371033, |
| "reward_std": 0.47437405586242676, |
| "rewards/cosine_scaled_reward/mean": -0.010936971753835678, |
| "rewards/cosine_scaled_reward/std": 0.51872718334198, |
| "rewards/format_reward/mean": 0.6875, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 82 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.421875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1630.0, |
| "completions/mean_length": 1375.609375, |
| "completions/mean_terminated_length": 884.9459838867188, |
| "completions/min_length": 515.0, |
| "completions/min_terminated_length": 515.0, |
| "epoch": 0.09485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1706554889678955, |
| "learning_rate": 7.612622032536507e-07, |
| "loss": 0.0555, |
| "num_tokens": 9992797.0, |
| "reward": 0.5926185846328735, |
| "reward_std": 0.5033661127090454, |
| "rewards/cosine_scaled_reward/mean": -0.008378200232982635, |
| "rewards/cosine_scaled_reward/std": 0.43848997354507446, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 83 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.453125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2017.0, |
| "completions/mean_length": 1469.953125, |
| "completions/mean_terminated_length": 991.0, |
| "completions/min_length": 415.0, |
| "completions/min_terminated_length": 415.0, |
| "epoch": 0.096, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.162735253572464, |
| "learning_rate": 7.54295724882796e-07, |
| "loss": 0.0524, |
| "num_tokens": 10097570.0, |
| "reward": 0.305806964635849, |
| "reward_std": 0.7360225915908813, |
| "rewards/cosine_scaled_reward/mean": -0.1361590325832367, |
| "rewards/cosine_scaled_reward/std": 0.3629942834377289, |
| "rewards/format_reward/mean": 0.578125, |
| "rewards/format_reward/std": 0.49776285886764526, |
| "step": 84 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1938.0, |
| "completions/mean_length": 1538.546875, |
| "completions/mean_terminated_length": 1271.6905517578125, |
| "completions/min_length": 757.0, |
| "completions/min_terminated_length": 757.0, |
| "epoch": 0.09714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1640138328075409, |
| "learning_rate": 7.472670160550848e-07, |
| "loss": 0.0237, |
| "num_tokens": 10206357.0, |
| "reward": 0.5352065563201904, |
| "reward_std": 0.6669929027557373, |
| "rewards/cosine_scaled_reward/mean": -0.10739670693874359, |
| "rewards/cosine_scaled_reward/std": 0.3617566227912903, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 85 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1729.0, |
| "completions/mean_length": 1533.96875, |
| "completions/mean_terminated_length": 1051.0909423828125, |
| "completions/min_length": 487.0, |
| "completions/min_terminated_length": 487.0, |
| "epoch": 0.09828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.20526152849197388, |
| "learning_rate": 7.401782177833147e-07, |
| "loss": 0.0296, |
| "num_tokens": 10315387.0, |
| "reward": 0.17176683247089386, |
| "reward_std": 0.5576786994934082, |
| "rewards/cosine_scaled_reward/mean": -0.17974159121513367, |
| "rewards/cosine_scaled_reward/std": 0.29362010955810547, |
| "rewards/format_reward/mean": 0.53125, |
| "rewards/format_reward/std": 0.5029674172401428, |
| "step": 86 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.265625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1923.0, |
| "completions/mean_length": 1203.109375, |
| "completions/mean_terminated_length": 897.5106201171875, |
| "completions/min_length": 294.0, |
| "completions/min_terminated_length": 294.0, |
| "epoch": 0.09942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15400013327598572, |
| "learning_rate": 7.330314893841101e-07, |
| "loss": 0.0372, |
| "num_tokens": 10402506.0, |
| "reward": 0.40166229009628296, |
| "reward_std": 0.5953558683395386, |
| "rewards/cosine_scaled_reward/mean": -0.15854386985301971, |
| "rewards/cosine_scaled_reward/std": 0.33755984902381897, |
| "rewards/format_reward/mean": 0.71875, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 87 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1830.0, |
| "completions/mean_length": 1300.203125, |
| "completions/mean_terminated_length": 820.84619140625, |
| "completions/min_length": 143.0, |
| "completions/min_terminated_length": 143.0, |
| "epoch": 0.10057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18996815383434296, |
| "learning_rate": 7.258290078201731e-07, |
| "loss": 0.0892, |
| "num_tokens": 10496231.0, |
| "reward": 0.8728382587432861, |
| "reward_std": 0.9383659958839417, |
| "rewards/cosine_scaled_reward/mean": 0.13173162937164307, |
| "rewards/cosine_scaled_reward/std": 0.4831489026546478, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 88 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1840.0, |
| "completions/mean_length": 1399.203125, |
| "completions/mean_terminated_length": 1145.3260498046875, |
| "completions/min_length": 404.0, |
| "completions/min_terminated_length": 404.0, |
| "epoch": 0.10171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16587954759597778, |
| "learning_rate": 7.185729670371604e-07, |
| "loss": 0.0474, |
| "num_tokens": 10596780.0, |
| "reward": 0.5412895679473877, |
| "reward_std": 0.7176238894462585, |
| "rewards/cosine_scaled_reward/mean": -0.12779270112514496, |
| "rewards/cosine_scaled_reward/std": 0.4147184491157532, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 89 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2042.0, |
| "completions/mean_length": 1320.28125, |
| "completions/mean_terminated_length": 989.5, |
| "completions/min_length": 146.0, |
| "completions/min_terminated_length": 146.0, |
| "epoch": 0.10285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18331165611743927, |
| "learning_rate": 7.11265577295385e-07, |
| "loss": 0.0604, |
| "num_tokens": 10691278.0, |
| "reward": 0.4162590801715851, |
| "reward_std": 0.5778031349182129, |
| "rewards/cosine_scaled_reward/mean": -0.15905795991420746, |
| "rewards/cosine_scaled_reward/std": 0.30715692043304443, |
| "rewards/format_reward/mean": 0.734375, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 90 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.53125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2028.0, |
| "completions/mean_length": 1667.765625, |
| "completions/mean_terminated_length": 1236.8333740234375, |
| "completions/min_length": 350.0, |
| "completions/min_terminated_length": 350.0, |
| "epoch": 0.104, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18394418060779572, |
| "learning_rate": 7.039090644965509e-07, |
| "loss": 0.0414, |
| "num_tokens": 10808583.0, |
| "reward": 0.3724231719970703, |
| "reward_std": 0.7342937588691711, |
| "rewards/cosine_scaled_reward/mean": -0.08722592890262604, |
| "rewards/cosine_scaled_reward/std": 0.40396004915237427, |
| "rewards/format_reward/mean": 0.546875, |
| "rewards/format_reward/std": 0.501733124256134, |
| "step": 91 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1795.0, |
| "completions/mean_length": 1328.3125, |
| "completions/mean_terminated_length": 924.5853271484375, |
| "completions/min_length": 315.0, |
| "completions/min_terminated_length": 315.0, |
| "epoch": 0.10514285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.155538409948349, |
| "learning_rate": 6.965056695057204e-07, |
| "loss": 0.0317, |
| "num_tokens": 10903587.0, |
| "reward": 0.27152636647224426, |
| "reward_std": 0.41478919982910156, |
| "rewards/cosine_scaled_reward/mean": -0.20017430186271667, |
| "rewards/cosine_scaled_reward/std": 0.21147257089614868, |
| "rewards/format_reward/mean": 0.671875, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 92 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.703125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1591.0, |
| "completions/mean_length": 1794.3125, |
| "completions/mean_terminated_length": 1193.4736328125, |
| "completions/min_length": 525.0, |
| "completions/min_terminated_length": 525.0, |
| "epoch": 0.10628571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.22204875946044922, |
| "learning_rate": 6.890576474687263e-07, |
| "loss": 0.0395, |
| "num_tokens": 11029767.0, |
| "reward": -0.17630083858966827, |
| "reward_std": 0.37804993987083435, |
| "rewards/cosine_scaled_reward/mean": -0.2522129416465759, |
| "rewards/cosine_scaled_reward/std": 0.1775641143321991, |
| "rewards/format_reward/mean": 0.328125, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 93 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.5625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2006.0, |
| "completions/mean_length": 1590.546875, |
| "completions/mean_terminated_length": 1002.3928833007812, |
| "completions/min_length": 456.0, |
| "completions/min_terminated_length": 456.0, |
| "epoch": 0.10742857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.2089747041463852, |
| "learning_rate": 6.815672671252315e-07, |
| "loss": 0.0437, |
| "num_tokens": 11141330.0, |
| "reward": 0.2654608488082886, |
| "reward_std": 0.46520984172821045, |
| "rewards/cosine_scaled_reward/mean": -0.08601956069469452, |
| "rewards/cosine_scaled_reward/std": 0.44666990637779236, |
| "rewards/format_reward/mean": 0.4375, |
| "rewards/format_reward/std": 0.5, |
| "step": 94 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.578125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2034.0, |
| "completions/mean_length": 1713.703125, |
| "completions/mean_terminated_length": 1255.5926513671875, |
| "completions/min_length": 685.0, |
| "completions/min_terminated_length": 685.0, |
| "epoch": 0.10857142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.18688982725143433, |
| "learning_rate": 6.740368101176495e-07, |
| "loss": 0.0541, |
| "num_tokens": 11261535.0, |
| "reward": -0.05255071818828583, |
| "reward_std": 0.5581967234611511, |
| "rewards/cosine_scaled_reward/mean": -0.2450253665447235, |
| "rewards/cosine_scaled_reward/std": 0.26258015632629395, |
| "rewards/format_reward/mean": 0.4375, |
| "rewards/format_reward/std": 0.5, |
| "step": 95 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1974.0, |
| "completions/mean_length": 1330.15625, |
| "completions/mean_terminated_length": 1110.4080810546875, |
| "completions/min_length": 494.0, |
| "completions/min_terminated_length": 494.0, |
| "epoch": 0.10971428571428571, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16858014464378357, |
| "learning_rate": 6.664685702961344e-07, |
| "loss": 0.0578, |
| "num_tokens": 11357801.0, |
| "reward": 0.6634380221366882, |
| "reward_std": 0.605156421661377, |
| "rewards/cosine_scaled_reward/mean": -0.05890599265694618, |
| "rewards/cosine_scaled_reward/std": 0.43818399310112, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 96 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1821.0, |
| "completions/mean_length": 1436.109375, |
| "completions/mean_terminated_length": 1157.977294921875, |
| "completions/min_length": 506.0, |
| "completions/min_terminated_length": 506.0, |
| "epoch": 0.11085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16547614336013794, |
| "learning_rate": 6.588648530198504e-07, |
| "loss": 0.0574, |
| "num_tokens": 11459976.0, |
| "reward": 0.4811592698097229, |
| "reward_std": 0.7198842167854309, |
| "rewards/cosine_scaled_reward/mean": -0.15004536509513855, |
| "rewards/cosine_scaled_reward/std": 0.3795333206653595, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 97 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1975.0, |
| "completions/mean_length": 1367.375, |
| "completions/mean_terminated_length": 1034.9766845703125, |
| "completions/min_length": 378.0, |
| "completions/min_terminated_length": 378.0, |
| "epoch": 0.112, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16006368398666382, |
| "learning_rate": 6.512279744547392e-07, |
| "loss": 0.0562, |
| "num_tokens": 11558552.0, |
| "reward": 0.6184609532356262, |
| "reward_std": 0.6620975136756897, |
| "rewards/cosine_scaled_reward/mean": -0.05014452338218689, |
| "rewards/cosine_scaled_reward/std": 0.4317678213119507, |
| "rewards/format_reward/mean": 0.71875, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 98 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1973.0, |
| "completions/mean_length": 1430.109375, |
| "completions/mean_terminated_length": 1007.3421020507812, |
| "completions/min_length": 257.0, |
| "completions/min_terminated_length": 257.0, |
| "epoch": 0.11314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1656832993030548, |
| "learning_rate": 6.435602608679916e-07, |
| "loss": 0.0158, |
| "num_tokens": 11661247.0, |
| "reward": 0.5446015000343323, |
| "reward_std": 0.771472692489624, |
| "rewards/cosine_scaled_reward/mean": -0.06363675743341446, |
| "rewards/cosine_scaled_reward/std": 0.46424856781959534, |
| "rewards/format_reward/mean": 0.671875, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 99 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1241.515625, |
| "completions/mean_terminated_length": 1092.1666259765625, |
| "completions/min_length": 501.0, |
| "completions/min_terminated_length": 501.0, |
| "epoch": 0.11428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.156847283244133, |
| "learning_rate": 6.358640479194451e-07, |
| "loss": 0.0072, |
| "num_tokens": 11750416.0, |
| "reward": 0.8802791833877563, |
| "reward_std": 0.6878768801689148, |
| "rewards/cosine_scaled_reward/mean": 0.010452112182974815, |
| "rewards/cosine_scaled_reward/std": 0.44097819924354553, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 100 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1842.0, |
| "completions/mean_length": 1303.140625, |
| "completions/mean_terminated_length": 1054.854248046875, |
| "completions/min_length": 457.0, |
| "completions/min_terminated_length": 457.0, |
| "epoch": 0.11542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1940712332725525, |
| "learning_rate": 6.281416799501187e-07, |
| "loss": 0.0692, |
| "num_tokens": 11845249.0, |
| "reward": 0.6096305847167969, |
| "reward_std": 0.6024209260940552, |
| "rewards/cosine_scaled_reward/mean": -0.08580972999334335, |
| "rewards/cosine_scaled_reward/std": 0.4317456781864166, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 101 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1781.0, |
| "completions/mean_length": 1110.4375, |
| "completions/mean_terminated_length": 915.8490600585938, |
| "completions/min_length": 417.0, |
| "completions/min_terminated_length": 417.0, |
| "epoch": 0.11657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15823149681091309, |
| "learning_rate": 6.203955092681039e-07, |
| "loss": -0.01, |
| "num_tokens": 11926469.0, |
| "reward": 0.6660584211349487, |
| "reward_std": 0.6825114488601685, |
| "rewards/cosine_scaled_reward/mean": -0.09665828198194504, |
| "rewards/cosine_scaled_reward/std": 0.403474897146225, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 102 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2039.0, |
| "completions/mean_length": 1261.796875, |
| "completions/mean_terminated_length": 849.9761962890625, |
| "completions/min_length": 156.0, |
| "completions/min_terminated_length": 156.0, |
| "epoch": 0.11771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15815117955207825, |
| "learning_rate": 6.126278954320294e-07, |
| "loss": 0.034, |
| "num_tokens": 12017576.0, |
| "reward": 0.34080806374549866, |
| "reward_std": 0.6322569251060486, |
| "rewards/cosine_scaled_reward/mean": -0.19678348302841187, |
| "rewards/cosine_scaled_reward/std": 0.32747402787208557, |
| "rewards/format_reward/mean": 0.734375, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 103 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1834.0, |
| "completions/mean_length": 1391.296875, |
| "completions/mean_terminated_length": 941.9736938476562, |
| "completions/min_length": 376.0, |
| "completions/min_terminated_length": 376.0, |
| "epoch": 0.11885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19132234156131744, |
| "learning_rate": 6.048412045323164e-07, |
| "loss": 0.0082, |
| "num_tokens": 12117083.0, |
| "reward": 0.37928348779678345, |
| "reward_std": 0.49314552545547485, |
| "rewards/cosine_scaled_reward/mean": -0.1306707262992859, |
| "rewards/cosine_scaled_reward/std": 0.32499685883522034, |
| "rewards/format_reward/mean": 0.640625, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 104 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1541.0, |
| "completions/mean_length": 1213.046875, |
| "completions/mean_terminated_length": 886.3261108398438, |
| "completions/min_length": 484.0, |
| "completions/min_terminated_length": 484.0, |
| "epoch": 0.12, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14559786021709442, |
| "learning_rate": 5.97037808470444e-07, |
| "loss": 0.0717, |
| "num_tokens": 12206006.0, |
| "reward": 0.682517409324646, |
| "reward_std": 0.7797682285308838, |
| "rewards/cosine_scaled_reward/mean": -0.04155381768941879, |
| "rewards/cosine_scaled_reward/std": 0.46402302384376526, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 105 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1967.0, |
| "completions/mean_length": 1300.3125, |
| "completions/mean_terminated_length": 880.8779907226562, |
| "completions/min_length": 266.0, |
| "completions/min_terminated_length": 266.0, |
| "epoch": 0.12114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1580500453710556, |
| "learning_rate": 5.892200842364462e-07, |
| "loss": 0.0158, |
| "num_tokens": 12300058.0, |
| "reward": 0.8088675737380981, |
| "reward_std": 0.6575020551681519, |
| "rewards/cosine_scaled_reward/mean": 0.06849630177021027, |
| "rewards/cosine_scaled_reward/std": 0.44056057929992676, |
| "rewards/format_reward/mean": 0.671875, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 106 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1818.0, |
| "completions/mean_length": 1324.09375, |
| "completions/mean_terminated_length": 917.9999389648438, |
| "completions/min_length": 500.0, |
| "completions/min_terminated_length": 500.0, |
| "epoch": 0.12228571428571429, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1742219179868698, |
| "learning_rate": 5.813904131848564e-07, |
| "loss": 0.0609, |
| "num_tokens": 12395952.0, |
| "reward": 0.3355618119239807, |
| "reward_std": 0.6852389574050903, |
| "rewards/cosine_scaled_reward/mean": -0.19159409403800964, |
| "rewards/cosine_scaled_reward/std": 0.36729469895362854, |
| "rewards/format_reward/mean": 0.71875, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 107 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.40625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2046.0, |
| "completions/mean_length": 1475.21875, |
| "completions/mean_terminated_length": 1083.3157958984375, |
| "completions/min_length": 441.0, |
| "completions/min_terminated_length": 441.0, |
| "epoch": 0.12342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17198115587234497, |
| "learning_rate": 5.735511803093248e-07, |
| "loss": 0.0333, |
| "num_tokens": 12500726.0, |
| "reward": 0.4056100845336914, |
| "reward_std": 0.8831891417503357, |
| "rewards/cosine_scaled_reward/mean": -0.11750747263431549, |
| "rewards/cosine_scaled_reward/std": 0.43770650029182434, |
| "rewards/format_reward/mean": 0.640625, |
| "rewards/format_reward/std": 0.4836103618144989, |
| "step": 108 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1918.0, |
| "completions/mean_length": 1491.984375, |
| "completions/mean_terminated_length": 969.6666870117188, |
| "completions/min_length": 413.0, |
| "completions/min_terminated_length": 413.0, |
| "epoch": 0.12457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17869649827480316, |
| "learning_rate": 5.657047735161255e-07, |
| "loss": 0.0269, |
| "num_tokens": 12607461.0, |
| "reward": 0.43367689847946167, |
| "reward_std": 0.4224759340286255, |
| "rewards/cosine_scaled_reward/mean": -0.08003655076026917, |
| "rewards/cosine_scaled_reward/std": 0.4198642373085022, |
| "rewards/format_reward/mean": 0.59375, |
| "rewards/format_reward/std": 0.49501484632492065, |
| "step": 109 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1827.0, |
| "completions/mean_length": 1304.03125, |
| "completions/mean_terminated_length": 886.6829223632812, |
| "completions/min_length": 297.0, |
| "completions/min_terminated_length": 297.0, |
| "epoch": 0.12571428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16700071096420288, |
| "learning_rate": 5.578535828967777e-07, |
| "loss": 0.0425, |
| "num_tokens": 12701695.0, |
| "reward": 0.28147247433662415, |
| "reward_std": 0.5220406651496887, |
| "rewards/cosine_scaled_reward/mean": -0.21082626283168793, |
| "rewards/cosine_scaled_reward/std": 0.26231563091278076, |
| "rewards/format_reward/mean": 0.703125, |
| "rewards/format_reward/std": 0.4604927599430084, |
| "step": 110 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2015.0, |
| "completions/mean_length": 1238.453125, |
| "completions/mean_terminated_length": 1051.6346435546875, |
| "completions/min_length": 164.0, |
| "completions/min_terminated_length": 164.0, |
| "epoch": 0.12685714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1514654904603958, |
| "learning_rate": 5.5e-07, |
| "loss": 0.0223, |
| "num_tokens": 12791076.0, |
| "reward": 0.6152657270431519, |
| "reward_std": 0.724465012550354, |
| "rewards/cosine_scaled_reward/mean": -0.12986713647842407, |
| "rewards/cosine_scaled_reward/std": 0.3984290361404419, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 111 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.515625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1637.421875, |
| "completions/mean_terminated_length": 1200.3548583984375, |
| "completions/min_length": 493.0, |
| "completions/min_terminated_length": 493.0, |
| "epoch": 0.128, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.19140374660491943, |
| "learning_rate": 5.421464171032224e-07, |
| "loss": 0.0523, |
| "num_tokens": 12906967.0, |
| "reward": 0.3433418571949005, |
| "reward_std": 0.7630938291549683, |
| "rewards/cosine_scaled_reward/mean": -0.08614157140254974, |
| "rewards/cosine_scaled_reward/std": 0.39247801899909973, |
| "rewards/format_reward/mean": 0.515625, |
| "rewards/format_reward/std": 0.5037065148353577, |
| "step": 112 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1730.0, |
| "completions/mean_length": 1076.734375, |
| "completions/mean_terminated_length": 976.2586059570312, |
| "completions/min_length": 441.0, |
| "completions/min_terminated_length": 441.0, |
| "epoch": 0.12914285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1517946422100067, |
| "learning_rate": 5.342952264838747e-07, |
| "loss": 0.0395, |
| "num_tokens": 12986174.0, |
| "reward": 0.8363064527511597, |
| "reward_std": 0.7532539367675781, |
| "rewards/cosine_scaled_reward/mean": -0.034971803426742554, |
| "rewards/cosine_scaled_reward/std": 0.4582027494907379, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 113 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1983.0, |
| "completions/mean_length": 1117.0, |
| "completions/mean_terminated_length": 923.7736206054688, |
| "completions/min_length": 274.0, |
| "completions/min_terminated_length": 274.0, |
| "epoch": 0.13028571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13561034202575684, |
| "learning_rate": 5.264488196906752e-07, |
| "loss": -0.0089, |
| "num_tokens": 13067510.0, |
| "reward": 0.5447627305984497, |
| "reward_std": 0.5072149038314819, |
| "rewards/cosine_scaled_reward/mean": -0.18855616450309753, |
| "rewards/cosine_scaled_reward/std": 0.30408668518066406, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 114 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1981.0, |
| "completions/mean_length": 1386.25, |
| "completions/mean_terminated_length": 1039.6190185546875, |
| "completions/min_length": 341.0, |
| "completions/min_terminated_length": 341.0, |
| "epoch": 0.13142857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15800845623016357, |
| "learning_rate": 5.186095868151436e-07, |
| "loss": 0.0471, |
| "num_tokens": 13167446.0, |
| "reward": 0.3318045139312744, |
| "reward_std": 0.42589348554611206, |
| "rewards/cosine_scaled_reward/mean": -0.201285257935524, |
| "rewards/cosine_scaled_reward/std": 0.3381516635417938, |
| "rewards/format_reward/mean": 0.734375, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 115 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.46875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2022.0, |
| "completions/mean_length": 1590.546875, |
| "completions/mean_terminated_length": 1186.9117431640625, |
| "completions/min_length": 360.0, |
| "completions/min_terminated_length": 360.0, |
| "epoch": 0.13257142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1938212811946869, |
| "learning_rate": 5.107799157635538e-07, |
| "loss": 0.0594, |
| "num_tokens": 13280625.0, |
| "reward": 0.33110541105270386, |
| "reward_std": 0.8637042045593262, |
| "rewards/cosine_scaled_reward/mean": -0.14694729447364807, |
| "rewards/cosine_scaled_reward/std": 0.4159691035747528, |
| "rewards/format_reward/mean": 0.625, |
| "rewards/format_reward/std": 0.48795005679130554, |
| "step": 116 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2014.0, |
| "completions/mean_length": 1270.0625, |
| "completions/mean_terminated_length": 1052.239990234375, |
| "completions/min_length": 345.0, |
| "completions/min_terminated_length": 345.0, |
| "epoch": 0.1337142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1720697581768036, |
| "learning_rate": 5.02962191529556e-07, |
| "loss": 0.0298, |
| "num_tokens": 13372933.0, |
| "reward": 0.4778579771518707, |
| "reward_std": 0.7142170667648315, |
| "rewards/cosine_scaled_reward/mean": -0.15950849652290344, |
| "rewards/cosine_scaled_reward/std": 0.3388991951942444, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 117 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1997.0, |
| "completions/mean_length": 1366.125, |
| "completions/mean_terminated_length": 1078.2222900390625, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.13485714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15761680901050568, |
| "learning_rate": 4.951587954676837e-07, |
| "loss": -0.0003, |
| "num_tokens": 13470901.0, |
| "reward": 0.8694435954093933, |
| "reward_std": 0.7288352847099304, |
| "rewards/cosine_scaled_reward/mean": 0.05972181260585785, |
| "rewards/cosine_scaled_reward/std": 0.5152958035469055, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 118 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2048.0, |
| "completions/mean_length": 1087.609375, |
| "completions/mean_terminated_length": 888.2830200195312, |
| "completions/min_length": 298.0, |
| "completions/min_terminated_length": 298.0, |
| "epoch": 0.136, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1559121459722519, |
| "learning_rate": 4.873721045679706e-07, |
| "loss": -0.0042, |
| "num_tokens": 13550916.0, |
| "reward": 0.7770103216171265, |
| "reward_std": 0.6413853764533997, |
| "rewards/cosine_scaled_reward/mean": -0.04118231683969498, |
| "rewards/cosine_scaled_reward/std": 0.4513413906097412, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 119 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1954.0, |
| "completions/mean_length": 892.9375, |
| "completions/mean_terminated_length": 855.6773681640625, |
| "completions/min_length": 221.0, |
| "completions/min_terminated_length": 221.0, |
| "epoch": 0.13714285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15053324401378632, |
| "learning_rate": 4.79604490731896e-07, |
| "loss": 0.0091, |
| "num_tokens": 13618520.0, |
| "reward": 0.8417136073112488, |
| "reward_std": 0.5830849409103394, |
| "rewards/cosine_scaled_reward/mean": -0.06351819634437561, |
| "rewards/cosine_scaled_reward/std": 0.4826962351799011, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 120 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1640.0, |
| "completions/mean_length": 737.3125, |
| "completions/mean_terminated_length": 716.5079956054688, |
| "completions/min_length": 177.0, |
| "completions/min_terminated_length": 177.0, |
| "epoch": 0.1382857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.12602098286151886, |
| "learning_rate": 4.7185832004988133e-07, |
| "loss": -0.012, |
| "num_tokens": 13675404.0, |
| "reward": 1.0489320755004883, |
| "reward_std": 0.571333646774292, |
| "rewards/cosine_scaled_reward/mean": 0.02446599304676056, |
| "rewards/cosine_scaled_reward/std": 0.4796800911426544, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 121 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1387.875, |
| "completions/mean_terminated_length": 1109.1556396484375, |
| "completions/min_length": 378.0, |
| "completions/min_terminated_length": 378.0, |
| "epoch": 0.13942857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1755150556564331, |
| "learning_rate": 4.641359520805548e-07, |
| "loss": 0.0142, |
| "num_tokens": 13775788.0, |
| "reward": 0.535797119140625, |
| "reward_std": 0.7094154357910156, |
| "rewards/cosine_scaled_reward/mean": -0.1071014553308487, |
| "rewards/cosine_scaled_reward/std": 0.40171730518341064, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 122 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1906.0, |
| "completions/mean_length": 1303.28125, |
| "completions/mean_terminated_length": 1011.8695678710938, |
| "completions/min_length": 419.0, |
| "completions/min_terminated_length": 419.0, |
| "epoch": 0.14057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1657014787197113, |
| "learning_rate": 4.5643973913200837e-07, |
| "loss": 0.0521, |
| "num_tokens": 13869430.0, |
| "reward": 0.4624041020870209, |
| "reward_std": 0.4812185764312744, |
| "rewards/cosine_scaled_reward/mean": -0.15942296385765076, |
| "rewards/cosine_scaled_reward/std": 0.38984978199005127, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 123 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2021.0, |
| "completions/mean_length": 1265.578125, |
| "completions/mean_terminated_length": 1046.5, |
| "completions/min_length": 414.0, |
| "completions/min_terminated_length": 414.0, |
| "epoch": 0.1417142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13778002560138702, |
| "learning_rate": 4.4877202554526084e-07, |
| "loss": 0.0275, |
| "num_tokens": 13961379.0, |
| "reward": 0.772553563117981, |
| "reward_std": 0.6200233101844788, |
| "rewards/cosine_scaled_reward/mean": -0.004348240792751312, |
| "rewards/cosine_scaled_reward/std": 0.4568707346916199, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 124 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2040.0, |
| "completions/mean_length": 1370.4375, |
| "completions/mean_terminated_length": 1084.3555908203125, |
| "completions/min_length": 395.0, |
| "completions/min_terminated_length": 395.0, |
| "epoch": 0.14285714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15328042209148407, |
| "learning_rate": 4.4113514698014953e-07, |
| "loss": -0.0036, |
| "num_tokens": 14060015.0, |
| "reward": 0.751072883605957, |
| "reward_std": 0.6632312536239624, |
| "rewards/cosine_scaled_reward/mean": 0.0005364194512367249, |
| "rewards/cosine_scaled_reward/std": 0.4951366186141968, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 125 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.296875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2011.0, |
| "completions/mean_length": 1375.4375, |
| "completions/mean_terminated_length": 1091.4666748046875, |
| "completions/min_length": 541.0, |
| "completions/min_terminated_length": 541.0, |
| "epoch": 0.144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17353153228759766, |
| "learning_rate": 4.3353142970386557e-07, |
| "loss": 0.0196, |
| "num_tokens": 14159339.0, |
| "reward": 0.5659087300300598, |
| "reward_std": 0.7280570268630981, |
| "rewards/cosine_scaled_reward/mean": -0.0764206275343895, |
| "rewards/cosine_scaled_reward/std": 0.3900720179080963, |
| "rewards/format_reward/mean": 0.71875, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 126 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1968.0, |
| "completions/mean_length": 1305.984375, |
| "completions/mean_terminated_length": 1078.836669921875, |
| "completions/min_length": 646.0, |
| "completions/min_terminated_length": 646.0, |
| "epoch": 0.14514285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15398724377155304, |
| "learning_rate": 4.2596318988235037e-07, |
| "loss": 0.1028, |
| "num_tokens": 14254018.0, |
| "reward": 0.46388188004493713, |
| "reward_std": 0.730462372303009, |
| "rewards/cosine_scaled_reward/mean": -0.15868405997753143, |
| "rewards/cosine_scaled_reward/std": 0.3666206896305084, |
| "rewards/format_reward/mean": 0.78125, |
| "rewards/format_reward/std": 0.4166666865348816, |
| "step": 127 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1916.0, |
| "completions/mean_length": 1140.96875, |
| "completions/mean_terminated_length": 1029.5789794921875, |
| "completions/min_length": 503.0, |
| "completions/min_terminated_length": 503.0, |
| "epoch": 0.1462857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1500931680202484, |
| "learning_rate": 4.1843273287476854e-07, |
| "loss": 0.0234, |
| "num_tokens": 14337392.0, |
| "reward": 0.9014174342155457, |
| "reward_std": 0.7117189764976501, |
| "rewards/cosine_scaled_reward/mean": -0.018041294068098068, |
| "rewards/cosine_scaled_reward/std": 0.44308605790138245, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 128 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.34375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1829.0, |
| "completions/mean_length": 1510.8125, |
| "completions/mean_terminated_length": 1229.4285888671875, |
| "completions/min_length": 456.0, |
| "completions/min_terminated_length": 456.0, |
| "epoch": 0.14742857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17119824886322021, |
| "learning_rate": 4.1094235253127374e-07, |
| "loss": 0.0387, |
| "num_tokens": 14444788.0, |
| "reward": 0.49041426181793213, |
| "reward_std": 0.7674543857574463, |
| "rewards/cosine_scaled_reward/mean": -0.08291786164045334, |
| "rewards/cosine_scaled_reward/std": 0.4110357463359833, |
| "rewards/format_reward/mean": 0.65625, |
| "rewards/format_reward/std": 0.4787135720252991, |
| "step": 129 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1928.0, |
| "completions/mean_length": 1295.78125, |
| "completions/mean_terminated_length": 953.8636474609375, |
| "completions/min_length": 355.0, |
| "completions/min_terminated_length": 355.0, |
| "epoch": 0.14857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1408931016921997, |
| "learning_rate": 4.034943304942796e-07, |
| "loss": 0.0237, |
| "num_tokens": 14538222.0, |
| "reward": 0.3384738564491272, |
| "reward_std": 0.5595801472663879, |
| "rewards/cosine_scaled_reward/mean": -0.1979505866765976, |
| "rewards/cosine_scaled_reward/std": 0.307023286819458, |
| "rewards/format_reward/mean": 0.734375, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 130 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2013.0, |
| "completions/mean_length": 1266.53125, |
| "completions/mean_terminated_length": 1047.719970703125, |
| "completions/min_length": 322.0, |
| "completions/min_terminated_length": 322.0, |
| "epoch": 0.14971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15982288122177124, |
| "learning_rate": 3.9609093550344907e-07, |
| "loss": 0.051, |
| "num_tokens": 14629016.0, |
| "reward": 0.8806734085083008, |
| "reward_std": 0.7300256490707397, |
| "rewards/cosine_scaled_reward/mean": 0.0340866819024086, |
| "rewards/cosine_scaled_reward/std": 0.44312214851379395, |
| "rewards/format_reward/mean": 0.8125, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 131 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.390625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1772.0, |
| "completions/mean_length": 1407.046875, |
| "completions/mean_terminated_length": 996.1795043945312, |
| "completions/min_length": 64.0, |
| "completions/min_terminated_length": 64.0, |
| "epoch": 0.15085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1669837236404419, |
| "learning_rate": 3.8873442270461485e-07, |
| "loss": 0.0031, |
| "num_tokens": 14730131.0, |
| "reward": 0.5413260459899902, |
| "reward_std": 0.7315264940261841, |
| "rewards/cosine_scaled_reward/mean": -0.034024473279714584, |
| "rewards/cosine_scaled_reward/std": 0.49355971813201904, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 132 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1997.0, |
| "completions/mean_length": 1341.828125, |
| "completions/mean_terminated_length": 1020.8409423828125, |
| "completions/min_length": 454.0, |
| "completions/min_terminated_length": 454.0, |
| "epoch": 0.152, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1627449095249176, |
| "learning_rate": 3.8142703296283953e-07, |
| "loss": 0.057, |
| "num_tokens": 14827232.0, |
| "reward": 0.48581433296203613, |
| "reward_std": 0.6289799213409424, |
| "rewards/cosine_scaled_reward/mean": -0.13990533351898193, |
| "rewards/cosine_scaled_reward/std": 0.3319030702114105, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 133 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2023.0, |
| "completions/mean_length": 1087.90625, |
| "completions/mean_terminated_length": 1006.5423583984375, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.15314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13660290837287903, |
| "learning_rate": 3.7417099217982686e-07, |
| "loss": 0.0397, |
| "num_tokens": 14907426.0, |
| "reward": 1.1596651077270508, |
| "reward_std": 0.5051962733268738, |
| "rewards/cosine_scaled_reward/mean": 0.1032700166106224, |
| "rewards/cosine_scaled_reward/std": 0.5394149422645569, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 134 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1921.0, |
| "completions/mean_length": 809.203125, |
| "completions/mean_terminated_length": 789.5397338867188, |
| "completions/min_length": 304.0, |
| "completions/min_terminated_length": 304.0, |
| "epoch": 0.15428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14757691323757172, |
| "learning_rate": 3.6696851061588994e-07, |
| "loss": -0.0311, |
| "num_tokens": 14969687.0, |
| "reward": 1.3557740449905396, |
| "reward_std": 0.6381043791770935, |
| "rewards/cosine_scaled_reward/mean": 0.18569952249526978, |
| "rewards/cosine_scaled_reward/std": 0.48723727464675903, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 135 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1900.0, |
| "completions/mean_length": 1199.28125, |
| "completions/mean_terminated_length": 1095.0526123046875, |
| "completions/min_length": 402.0, |
| "completions/min_terminated_length": 402.0, |
| "epoch": 0.15542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15660341084003448, |
| "learning_rate": 3.5982178221668533e-07, |
| "loss": 0.0159, |
| "num_tokens": 15057113.0, |
| "reward": 0.8486931920051575, |
| "reward_std": 0.7802823781967163, |
| "rewards/cosine_scaled_reward/mean": -0.05221588909626007, |
| "rewards/cosine_scaled_reward/std": 0.46035289764404297, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 136 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2012.0, |
| "completions/mean_length": 1251.625, |
| "completions/mean_terminated_length": 1169.2413330078125, |
| "completions/min_length": 499.0, |
| "completions/min_terminated_length": 499.0, |
| "epoch": 0.15657142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13971418142318726, |
| "learning_rate": 3.5273298394491515e-07, |
| "loss": 0.021, |
| "num_tokens": 15147825.0, |
| "reward": 0.73081374168396, |
| "reward_std": 0.6755009293556213, |
| "rewards/cosine_scaled_reward/mean": -0.11115560680627823, |
| "rewards/cosine_scaled_reward/std": 0.3759666979312897, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 137 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1807.0, |
| "completions/mean_length": 1242.625, |
| "completions/mean_terminated_length": 974.1666870117188, |
| "completions/min_length": 366.0, |
| "completions/min_terminated_length": 366.0, |
| "epoch": 0.15771428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1405755877494812, |
| "learning_rate": 3.45704275117204e-07, |
| "loss": 0.0063, |
| "num_tokens": 15238665.0, |
| "reward": 0.6383824348449707, |
| "reward_std": 0.5606896281242371, |
| "rewards/cosine_scaled_reward/mean": -0.10268379747867584, |
| "rewards/cosine_scaled_reward/std": 0.4464382231235504, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 138 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2019.0, |
| "completions/mean_length": 1168.5625, |
| "completions/mean_terminated_length": 1125.3114013671875, |
| "completions/min_length": 329.0, |
| "completions/min_terminated_length": 329.0, |
| "epoch": 0.15885714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1392800360918045, |
| "learning_rate": 3.387377967463493e-07, |
| "loss": 0.0174, |
| "num_tokens": 15324093.0, |
| "reward": 0.819107174873352, |
| "reward_std": 0.6458143591880798, |
| "rewards/cosine_scaled_reward/mean": -0.07482142746448517, |
| "rewards/cosine_scaled_reward/std": 0.447048157453537, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 139 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1946.0, |
| "completions/mean_length": 1163.484375, |
| "completions/mean_terminated_length": 1037.125, |
| "completions/min_length": 429.0, |
| "completions/min_terminated_length": 429.0, |
| "epoch": 0.16, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.16070181131362915, |
| "learning_rate": 3.3183567088914833e-07, |
| "loss": 0.0484, |
| "num_tokens": 15408780.0, |
| "reward": 0.8035323619842529, |
| "reward_std": 0.5583758354187012, |
| "rewards/cosine_scaled_reward/mean": -0.04354630410671234, |
| "rewards/cosine_scaled_reward/std": 0.5074254870414734, |
| "rewards/format_reward/mean": 0.890625, |
| "rewards/format_reward/std": 0.3145764470100403, |
| "step": 140 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1854.0, |
| "completions/mean_length": 1113.328125, |
| "completions/mean_terminated_length": 1067.360595703125, |
| "completions/min_length": 456.0, |
| "completions/min_terminated_length": 456.0, |
| "epoch": 0.16114285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13795652985572815, |
| "learning_rate": 3.250000000000001e-07, |
| "loss": 0.0117, |
| "num_tokens": 15490593.0, |
| "reward": 0.875638484954834, |
| "reward_std": 0.5237586498260498, |
| "rewards/cosine_scaled_reward/mean": -0.04655580222606659, |
| "rewards/cosine_scaled_reward/std": 0.49675485491752625, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 141 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2040.0, |
| "completions/mean_length": 1533.078125, |
| "completions/mean_terminated_length": 1388.9000244140625, |
| "completions/min_length": 376.0, |
| "completions/min_terminated_length": 376.0, |
| "epoch": 0.16228571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17600150406360626, |
| "learning_rate": 3.182328662904756e-07, |
| "loss": 0.0284, |
| "num_tokens": 15599214.0, |
| "reward": 0.7818896770477295, |
| "reward_std": 0.8826224207878113, |
| "rewards/cosine_scaled_reward/mean": -0.04655518755316734, |
| "rewards/cosine_scaled_reward/std": 0.48367196321487427, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 142 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1773.0, |
| "completions/mean_length": 1053.65625, |
| "completions/mean_terminated_length": 1004.7540283203125, |
| "completions/min_length": 425.0, |
| "completions/min_terminated_length": 425.0, |
| "epoch": 0.16342857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14887015521526337, |
| "learning_rate": 3.115363310950578e-07, |
| "loss": -0.0172, |
| "num_tokens": 15677464.0, |
| "reward": 0.7807654738426208, |
| "reward_std": 0.6237885355949402, |
| "rewards/cosine_scaled_reward/mean": -0.10180474817752838, |
| "rewards/cosine_scaled_reward/std": 0.3677360713481903, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 143 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1881.0, |
| "completions/mean_length": 1199.390625, |
| "completions/mean_terminated_length": 1003.5577392578125, |
| "completions/min_length": 312.0, |
| "completions/min_terminated_length": 312.0, |
| "epoch": 0.16457142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1500861942768097, |
| "learning_rate": 3.0491243424323783e-07, |
| "loss": 0.0237, |
| "num_tokens": 15765713.0, |
| "reward": 1.0909240245819092, |
| "reward_std": 0.6815290451049805, |
| "rewards/cosine_scaled_reward/mean": 0.12358702719211578, |
| "rewards/cosine_scaled_reward/std": 0.5295576453208923, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 144 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1994.0, |
| "completions/mean_length": 999.78125, |
| "completions/mean_terminated_length": 910.9491577148438, |
| "completions/min_length": 351.0, |
| "completions/min_terminated_length": 351.0, |
| "epoch": 0.1657142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13538403809070587, |
| "learning_rate": 2.9836319343816397e-07, |
| "loss": 0.0454, |
| "num_tokens": 15839643.0, |
| "reward": 0.8412516713142395, |
| "reward_std": 0.5241255760192871, |
| "rewards/cosine_scaled_reward/mean": -0.05593665689229965, |
| "rewards/cosine_scaled_reward/std": 0.412396639585495, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 145 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 1116.875, |
| "completions/mean_terminated_length": 1071.0819091796875, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.16685714285714287, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13304923474788666, |
| "learning_rate": 2.918906036420294e-07, |
| "loss": 0.0238, |
| "num_tokens": 15921739.0, |
| "reward": 0.6987366676330566, |
| "reward_std": 0.6291457414627075, |
| "rewards/cosine_scaled_reward/mean": -0.12719416618347168, |
| "rewards/cosine_scaled_reward/std": 0.4025166630744934, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 146 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2037.0, |
| "completions/mean_length": 1376.234375, |
| "completions/mean_terminated_length": 1188.1400146484375, |
| "completions/min_length": 470.0, |
| "completions/min_terminated_length": 470.0, |
| "epoch": 0.168, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17383964359760284, |
| "learning_rate": 2.854966364683872e-07, |
| "loss": 0.0816, |
| "num_tokens": 16020602.0, |
| "reward": 0.6075379848480225, |
| "reward_std": 0.6172347068786621, |
| "rewards/cosine_scaled_reward/mean": -0.12591850757598877, |
| "rewards/cosine_scaled_reward/std": 0.35805603861808777, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 147 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1895.0, |
| "completions/mean_length": 1046.640625, |
| "completions/mean_terminated_length": 961.7796630859375, |
| "completions/min_length": 286.0, |
| "completions/min_terminated_length": 286.0, |
| "epoch": 0.16914285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1272667795419693, |
| "learning_rate": 2.791832395815782e-07, |
| "loss": 0.0166, |
| "num_tokens": 16098771.0, |
| "reward": 0.831100583076477, |
| "reward_std": 0.46476393938064575, |
| "rewards/cosine_scaled_reward/mean": -0.06101220101118088, |
| "rewards/cosine_scaled_reward/std": 0.3975098729133606, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 148 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.484375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1946.0, |
| "completions/mean_length": 1548.46875, |
| "completions/mean_terminated_length": 1079.212158203125, |
| "completions/min_length": 636.0, |
| "completions/min_terminated_length": 636.0, |
| "epoch": 0.1702857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17459169030189514, |
| "learning_rate": 2.729523361034538e-07, |
| "loss": 0.0461, |
| "num_tokens": 16209769.0, |
| "reward": 0.2916935384273529, |
| "reward_std": 0.7498115301132202, |
| "rewards/cosine_scaled_reward/mean": -0.15884071588516235, |
| "rewards/cosine_scaled_reward/std": 0.3723042905330658, |
| "rewards/format_reward/mean": 0.609375, |
| "rewards/format_reward/std": 0.4917473793029785, |
| "step": 149 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2035.0, |
| "completions/mean_length": 1178.09375, |
| "completions/mean_terminated_length": 1088.1033935546875, |
| "completions/min_length": 331.0, |
| "completions/min_terminated_length": 331.0, |
| "epoch": 0.17142857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13624173402786255, |
| "learning_rate": 2.6680582402757324e-07, |
| "loss": -0.0165, |
| "num_tokens": 16295671.0, |
| "reward": 0.8602047562599182, |
| "reward_std": 0.708452582359314, |
| "rewards/cosine_scaled_reward/mean": -0.06208515167236328, |
| "rewards/cosine_scaled_reward/std": 0.43239399790763855, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 150 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1794.0, |
| "completions/mean_length": 1131.15625, |
| "completions/mean_terminated_length": 961.370361328125, |
| "completions/min_length": 401.0, |
| "completions/min_terminated_length": 401.0, |
| "epoch": 0.17257142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15374499559402466, |
| "learning_rate": 2.6074557564105724e-07, |
| "loss": 0.0201, |
| "num_tokens": 16378745.0, |
| "reward": 1.0417982339859009, |
| "reward_std": 0.7430429458618164, |
| "rewards/cosine_scaled_reward/mean": 0.09121159464120865, |
| "rewards/cosine_scaled_reward/std": 0.4806567430496216, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 151 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1911.0, |
| "completions/mean_length": 1266.53125, |
| "completions/mean_terminated_length": 1006.0416870117188, |
| "completions/min_length": 170.0, |
| "completions/min_terminated_length": 170.0, |
| "epoch": 0.1737142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17976705729961395, |
| "learning_rate": 2.547734369542718e-07, |
| "loss": 0.059, |
| "num_tokens": 16470467.0, |
| "reward": 0.6073766946792603, |
| "reward_std": 0.7206203937530518, |
| "rewards/cosine_scaled_reward/mean": -0.07912418246269226, |
| "rewards/cosine_scaled_reward/std": 0.39534807205200195, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 152 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1983.0, |
| "completions/mean_length": 1211.15625, |
| "completions/mean_terminated_length": 1091.607177734375, |
| "completions/min_length": 379.0, |
| "completions/min_terminated_length": 379.0, |
| "epoch": 0.17485714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14580874145030975, |
| "learning_rate": 2.488912271385139e-07, |
| "loss": 0.0205, |
| "num_tokens": 16559125.0, |
| "reward": 0.5799474716186523, |
| "reward_std": 0.5585569143295288, |
| "rewards/cosine_scaled_reward/mean": -0.17096377909183502, |
| "rewards/cosine_scaled_reward/std": 0.288126140832901, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 153 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2016.0, |
| "completions/mean_length": 1349.578125, |
| "completions/mean_terminated_length": 1154.02001953125, |
| "completions/min_length": 572.0, |
| "completions/min_terminated_length": 572.0, |
| "epoch": 0.176, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16763651371002197, |
| "learning_rate": 2.4310073797187573e-07, |
| "loss": 0.0217, |
| "num_tokens": 16656562.0, |
| "reward": 0.4374058246612549, |
| "reward_std": 0.5803461670875549, |
| "rewards/cosine_scaled_reward/mean": -0.19535958766937256, |
| "rewards/cosine_scaled_reward/std": 0.36079293489456177, |
| "rewards/format_reward/mean": 0.828125, |
| "rewards/format_reward/std": 0.38025420904159546, |
| "step": 154 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1642.0, |
| "completions/mean_length": 1151.65625, |
| "completions/mean_terminated_length": 800.9130859375, |
| "completions/min_length": 309.0, |
| "completions/min_terminated_length": 309.0, |
| "epoch": 0.17714285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14943136274814606, |
| "learning_rate": 2.374037332934512e-07, |
| "loss": 0.0004, |
| "num_tokens": 16740892.0, |
| "reward": 0.7691766023635864, |
| "reward_std": 0.6439853310585022, |
| "rewards/cosine_scaled_reward/mean": -0.021661702543497086, |
| "rewards/cosine_scaled_reward/std": 0.463664174079895, |
| "rewards/format_reward/mean": 0.8125, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 155 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1956.0, |
| "completions/mean_length": 1292.609375, |
| "completions/mean_terminated_length": 997.021728515625, |
| "completions/min_length": 440.0, |
| "completions/min_terminated_length": 440.0, |
| "epoch": 0.1782857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.138059601187706, |
| "learning_rate": 2.3180194846605364e-07, |
| "loss": 0.0125, |
| "num_tokens": 16833723.0, |
| "reward": 0.7149533033370972, |
| "reward_std": 0.7902576923370361, |
| "rewards/cosine_scaled_reward/mean": -0.0018983632326126099, |
| "rewards/cosine_scaled_reward/std": 0.49385347962379456, |
| "rewards/format_reward/mean": 0.71875, |
| "rewards/format_reward/std": 0.4531635046005249, |
| "step": 156 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2043.0, |
| "completions/mean_length": 1275.4375, |
| "completions/mean_terminated_length": 1038.938720703125, |
| "completions/min_length": 367.0, |
| "completions/min_terminated_length": 367.0, |
| "epoch": 0.17942857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16267189383506775, |
| "learning_rate": 2.2629708984760706e-07, |
| "loss": 0.0604, |
| "num_tokens": 16925311.0, |
| "reward": 0.48121872544288635, |
| "reward_std": 0.7515869140625, |
| "rewards/cosine_scaled_reward/mean": -0.15782812237739563, |
| "rewards/cosine_scaled_reward/std": 0.34000325202941895, |
| "rewards/format_reward/mean": 0.796875, |
| "rewards/format_reward/std": 0.40550529956817627, |
| "step": 157 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1001.390625, |
| "completions/mean_terminated_length": 912.6949462890625, |
| "completions/min_length": 218.0, |
| "completions/min_terminated_length": 218.0, |
| "epoch": 0.18057142857142858, |
| "frac_reward_zero_std": 0.125, |
| "grad_norm": 0.13367342948913574, |
| "learning_rate": 2.2089083427137329e-07, |
| "loss": 0.0333, |
| "num_tokens": 16999520.0, |
| "reward": 0.9302408695220947, |
| "reward_std": 0.558702826499939, |
| "rewards/cosine_scaled_reward/mean": -0.011442087590694427, |
| "rewards/cosine_scaled_reward/std": 0.4976855218410492, |
| "rewards/format_reward/mean": 0.953125, |
| "rewards/format_reward/std": 0.21304203569889069, |
| "step": 158 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2019.0, |
| "completions/mean_length": 1350.71875, |
| "completions/mean_terminated_length": 1155.47998046875, |
| "completions/min_length": 396.0, |
| "completions/min_terminated_length": 396.0, |
| "epoch": 0.18171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1456945538520813, |
| "learning_rate": 2.1558482853517253e-07, |
| "loss": 0.025, |
| "num_tokens": 17097646.0, |
| "reward": 0.5350826978683472, |
| "reward_std": 0.5987731218338013, |
| "rewards/cosine_scaled_reward/mean": -0.20120865106582642, |
| "rewards/cosine_scaled_reward/std": 0.3128848373889923, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 159 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1889.0, |
| "completions/mean_length": 1075.953125, |
| "completions/mean_terminated_length": 1028.1474609375, |
| "completions/min_length": 439.0, |
| "completions/min_terminated_length": 439.0, |
| "epoch": 0.18285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14507974684238434, |
| "learning_rate": 2.1038068889975259e-07, |
| "loss": 0.0368, |
| "num_tokens": 17178091.0, |
| "reward": 1.2330971956253052, |
| "reward_std": 0.7280604243278503, |
| "rewards/cosine_scaled_reward/mean": 0.124361053109169, |
| "rewards/cosine_scaled_reward/std": 0.47822633385658264, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 160 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1999.0, |
| "completions/mean_length": 1179.53125, |
| "completions/mean_terminated_length": 979.1154174804688, |
| "completions/min_length": 335.0, |
| "completions/min_terminated_length": 335.0, |
| "epoch": 0.184, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15625452995300293, |
| "learning_rate": 2.0528000059645995e-07, |
| "loss": 0.0581, |
| "num_tokens": 17263573.0, |
| "reward": 0.6913028955459595, |
| "reward_std": 0.7251037359237671, |
| "rewards/cosine_scaled_reward/mean": -0.07622354477643967, |
| "rewards/cosine_scaled_reward/std": 0.4124097228050232, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 161 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2009.0, |
| "completions/mean_length": 1241.859375, |
| "completions/mean_terminated_length": 1016.1399536132812, |
| "completions/min_length": 345.0, |
| "completions/min_terminated_length": 345.0, |
| "epoch": 0.18514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1465344876050949, |
| "learning_rate": 2.0028431734436308e-07, |
| "loss": 0.0541, |
| "num_tokens": 17353356.0, |
| "reward": 0.5661511421203613, |
| "reward_std": 0.651351809501648, |
| "rewards/cosine_scaled_reward/mean": -0.14661191403865814, |
| "rewards/cosine_scaled_reward/std": 0.33989307284355164, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 162 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.28125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1764.0, |
| "completions/mean_length": 1244.15625, |
| "completions/mean_terminated_length": 929.6087036132812, |
| "completions/min_length": 390.0, |
| "completions/min_terminated_length": 390.0, |
| "epoch": 0.18628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16913798451423645, |
| "learning_rate": 1.9539516087697517e-07, |
| "loss": 0.028, |
| "num_tokens": 17444166.0, |
| "reward": 1.0750610828399658, |
| "reward_std": 0.7454421520233154, |
| "rewards/cosine_scaled_reward/mean": 0.17034301161766052, |
| "rewards/cosine_scaled_reward/std": 0.47072502970695496, |
| "rewards/format_reward/mean": 0.734375, |
| "rewards/format_reward/std": 0.44515693187713623, |
| "step": 163 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1824.0, |
| "completions/mean_length": 1044.09375, |
| "completions/mean_terminated_length": 940.2413940429688, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.18742857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15672317147254944, |
| "learning_rate": 1.9061402047871833e-07, |
| "loss": 0.0608, |
| "num_tokens": 17521516.0, |
| "reward": 0.928094744682312, |
| "reward_std": 0.6140168309211731, |
| "rewards/cosine_scaled_reward/mean": 0.003109898418188095, |
| "rewards/cosine_scaled_reward/std": 0.44902321696281433, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 164 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2004.0, |
| "completions/mean_length": 1069.75, |
| "completions/mean_terminated_length": 1004.5333862304688, |
| "completions/min_length": 435.0, |
| "completions/min_terminated_length": 435.0, |
| "epoch": 0.18857142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14553649723529816, |
| "learning_rate": 1.8594235253127372e-07, |
| "loss": 0.0582, |
| "num_tokens": 17601396.0, |
| "reward": 1.1307251453399658, |
| "reward_std": 0.5828652381896973, |
| "rewards/cosine_scaled_reward/mean": 0.08098758012056351, |
| "rewards/cosine_scaled_reward/std": 0.5071448087692261, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 165 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1631.0, |
| "completions/mean_length": 1283.75, |
| "completions/mean_terminated_length": 1107.3846435546875, |
| "completions/min_length": 553.0, |
| "completions/min_terminated_length": 553.0, |
| "epoch": 0.18971428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15118341147899628, |
| "learning_rate": 1.8138158006995363e-07, |
| "loss": -0.0021, |
| "num_tokens": 17695132.0, |
| "reward": 0.9083728790283203, |
| "reward_std": 0.6904245615005493, |
| "rewards/cosine_scaled_reward/mean": 0.016686435788869858, |
| "rewards/cosine_scaled_reward/std": 0.4635255038738251, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 166 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.03125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1992.0, |
| "completions/mean_length": 1001.71875, |
| "completions/mean_terminated_length": 967.9677124023438, |
| "completions/min_length": 464.0, |
| "completions/min_terminated_length": 464.0, |
| "epoch": 0.19085714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.12545917928218842, |
| "learning_rate": 1.7693309235023127e-07, |
| "loss": 0.0616, |
| "num_tokens": 17770786.0, |
| "reward": 0.7054103016853333, |
| "reward_std": 0.5469927787780762, |
| "rewards/cosine_scaled_reward/mean": -0.13948234915733337, |
| "rewards/cosine_scaled_reward/std": 0.3140275478363037, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 167 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2000.0, |
| "completions/mean_length": 1226.34375, |
| "completions/mean_terminated_length": 1171.5667724609375, |
| "completions/min_length": 474.0, |
| "completions/min_terminated_length": 474.0, |
| "epoch": 0.192, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16034062206745148, |
| "learning_rate": 1.7259824442455923e-07, |
| "loss": -0.0032, |
| "num_tokens": 17860112.0, |
| "reward": 1.0590779781341553, |
| "reward_std": 0.6419005393981934, |
| "rewards/cosine_scaled_reward/mean": 0.037351518869400024, |
| "rewards/cosine_scaled_reward/std": 0.433667927980423, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 168 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1886.0, |
| "completions/mean_length": 1041.6875, |
| "completions/mean_terminated_length": 992.1966552734375, |
| "completions/min_length": 412.0, |
| "completions/min_terminated_length": 412.0, |
| "epoch": 0.19314285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13765358924865723, |
| "learning_rate": 1.6837835672960831e-07, |
| "loss": -0.0146, |
| "num_tokens": 17937404.0, |
| "reward": 1.4680163860321045, |
| "reward_std": 0.5853168964385986, |
| "rewards/cosine_scaled_reward/mean": 0.23400816321372986, |
| "rewards/cosine_scaled_reward/std": 0.5452130436897278, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 169 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1982.0, |
| "completions/mean_length": 1089.796875, |
| "completions/mean_terminated_length": 890.924560546875, |
| "completions/min_length": 318.0, |
| "completions/min_terminated_length": 318.0, |
| "epoch": 0.19428571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13031044602394104, |
| "learning_rate": 1.6427471468404952e-07, |
| "loss": 0.0316, |
| "num_tokens": 18016935.0, |
| "reward": 0.9512024521827698, |
| "reward_std": 0.4455436170101166, |
| "rewards/cosine_scaled_reward/mean": 0.03810122609138489, |
| "rewards/cosine_scaled_reward/std": 0.466457724571228, |
| "rewards/format_reward/mean": 0.875, |
| "rewards/format_reward/std": 0.3333333432674408, |
| "step": 170 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.25, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1960.0, |
| "completions/mean_length": 1223.53125, |
| "completions/mean_terminated_length": 948.7083740234375, |
| "completions/min_length": 432.0, |
| "completions/min_terminated_length": 432.0, |
| "epoch": 0.19542857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1566823422908783, |
| "learning_rate": 1.6028856829700258e-07, |
| "loss": -0.0008, |
| "num_tokens": 18105633.0, |
| "reward": 0.699189305305481, |
| "reward_std": 0.6079459190368652, |
| "rewards/cosine_scaled_reward/mean": -0.02540534734725952, |
| "rewards/cosine_scaled_reward/std": 0.4247443377971649, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 171 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.1875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1935.0, |
| "completions/mean_length": 1152.765625, |
| "completions/mean_terminated_length": 946.173095703125, |
| "completions/min_length": 164.0, |
| "completions/min_terminated_length": 164.0, |
| "epoch": 0.19657142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17446376383304596, |
| "learning_rate": 1.5642113178727193e-07, |
| "loss": 0.0925, |
| "num_tokens": 18190026.0, |
| "reward": 1.1308399438858032, |
| "reward_std": 0.7069583535194397, |
| "rewards/cosine_scaled_reward/mean": 0.13573244214057922, |
| "rewards/cosine_scaled_reward/std": 0.4778185784816742, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 172 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2029.0, |
| "completions/mean_length": 1048.71875, |
| "completions/mean_terminated_length": 905.96435546875, |
| "completions/min_length": 206.0, |
| "completions/min_terminated_length": 206.0, |
| "epoch": 0.1977142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13813698291778564, |
| "learning_rate": 1.5267358321348285e-07, |
| "loss": 0.0382, |
| "num_tokens": 18268456.0, |
| "reward": 0.727963387966156, |
| "reward_std": 0.5711731910705566, |
| "rewards/cosine_scaled_reward/mean": -0.0891432836651802, |
| "rewards/cosine_scaled_reward/std": 0.4088326096534729, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 173 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1869.0, |
| "completions/mean_length": 1010.03125, |
| "completions/mean_terminated_length": 958.9835815429688, |
| "completions/min_length": 423.0, |
| "completions/min_terminated_length": 423.0, |
| "epoch": 0.19885714285714284, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13912717998027802, |
| "learning_rate": 1.4904706411523448e-07, |
| "loss": 0.0267, |
| "num_tokens": 18343434.0, |
| "reward": 0.8880202770233154, |
| "reward_std": 0.7008156180381775, |
| "rewards/cosine_scaled_reward/mean": -0.040364816784858704, |
| "rewards/cosine_scaled_reward/std": 0.46977153420448303, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 174 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1985.0, |
| "completions/mean_length": 1065.96875, |
| "completions/mean_terminated_length": 1017.6720581054688, |
| "completions/min_length": 344.0, |
| "completions/min_terminated_length": 344.0, |
| "epoch": 0.2, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14181731641292572, |
| "learning_rate": 1.4554267916537495e-07, |
| "loss": 0.0015, |
| "num_tokens": 18422480.0, |
| "reward": 0.8645844459533691, |
| "reward_std": 0.5794019103050232, |
| "rewards/cosine_scaled_reward/mean": -0.03645776957273483, |
| "rewards/cosine_scaled_reward/std": 0.4564404785633087, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 175 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2009.0, |
| "completions/mean_length": 1199.859375, |
| "completions/mean_terminated_length": 962.3800048828125, |
| "completions/min_length": 293.0, |
| "completions/min_terminated_length": 293.0, |
| "epoch": 0.20114285714285715, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14228306710720062, |
| "learning_rate": 1.4216149583350755e-07, |
| "loss": 0.0401, |
| "num_tokens": 18510439.0, |
| "reward": 0.5378038883209229, |
| "reward_std": 0.6905298829078674, |
| "rewards/cosine_scaled_reward/mean": -0.13734807074069977, |
| "rewards/cosine_scaled_reward/std": 0.3545166850090027, |
| "rewards/format_reward/mean": 0.8125, |
| "rewards/format_reward/std": 0.39339789748191833, |
| "step": 176 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2045.0, |
| "completions/mean_length": 1136.078125, |
| "completions/mean_terminated_length": 967.2037353515625, |
| "completions/min_length": 341.0, |
| "completions/min_terminated_length": 341.0, |
| "epoch": 0.2022857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15204809606075287, |
| "learning_rate": 1.3890454406082956e-07, |
| "loss": 0.0195, |
| "num_tokens": 18593372.0, |
| "reward": 0.8706955313682556, |
| "reward_std": 0.561739444732666, |
| "rewards/cosine_scaled_reward/mean": 0.013472765684127808, |
| "rewards/cosine_scaled_reward/std": 0.5127261877059937, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 177 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.09375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1090.78125, |
| "completions/mean_terminated_length": 991.7586059570312, |
| "completions/min_length": 410.0, |
| "completions/min_terminated_length": 410.0, |
| "epoch": 0.20342857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14953841269016266, |
| "learning_rate": 1.3577281594640182e-07, |
| "loss": 0.0377, |
| "num_tokens": 18674718.0, |
| "reward": 1.023108720779419, |
| "reward_std": 0.6721357107162476, |
| "rewards/cosine_scaled_reward/mean": 0.042804330587387085, |
| "rewards/cosine_scaled_reward/std": 0.4879773259162903, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 178 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.328125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1739.0, |
| "completions/mean_length": 1336.46875, |
| "completions/mean_terminated_length": 988.9767456054688, |
| "completions/min_length": 450.0, |
| "completions/min_terminated_length": 450.0, |
| "epoch": 0.20457142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15529467165470123, |
| "learning_rate": 1.3276726544494571e-07, |
| "loss": 0.0596, |
| "num_tokens": 18771084.0, |
| "reward": 0.5006722211837769, |
| "reward_std": 0.8324818015098572, |
| "rewards/cosine_scaled_reward/mean": -0.12466391175985336, |
| "rewards/cosine_scaled_reward/std": 0.3854917585849762, |
| "rewards/format_reward/mean": 0.75, |
| "rewards/format_reward/std": 0.4364357888698578, |
| "step": 179 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2003.0, |
| "completions/mean_length": 1045.78125, |
| "completions/mean_terminated_length": 978.9667358398438, |
| "completions/min_length": 293.0, |
| "completions/min_terminated_length": 293.0, |
| "epoch": 0.2057142857142857, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16482438147068024, |
| "learning_rate": 1.2988880807625927e-07, |
| "loss": 0.0338, |
| "num_tokens": 18849190.0, |
| "reward": 1.5156805515289307, |
| "reward_std": 0.7658263444900513, |
| "rewards/cosine_scaled_reward/mean": 0.27346524596214294, |
| "rewards/cosine_scaled_reward/std": 0.4864564538002014, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 180 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.234375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1929.0, |
| "completions/mean_length": 1285.703125, |
| "completions/mean_terminated_length": 1052.346923828125, |
| "completions/min_length": 473.0, |
| "completions/min_terminated_length": 473.0, |
| "epoch": 0.20685714285714285, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15365898609161377, |
| "learning_rate": 1.2713832064634125e-07, |
| "loss": 0.0144, |
| "num_tokens": 18942947.0, |
| "reward": 0.8511902093887329, |
| "reward_std": 0.5781452059745789, |
| "rewards/cosine_scaled_reward/mean": -0.004092369228601456, |
| "rewards/cosine_scaled_reward/std": 0.5130707621574402, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 181 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.3125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2005.0, |
| "completions/mean_length": 1312.578125, |
| "completions/mean_terminated_length": 978.2954711914062, |
| "completions/min_length": 436.0, |
| "completions/min_terminated_length": 436.0, |
| "epoch": 0.208, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14789460599422455, |
| "learning_rate": 1.2451664098030743e-07, |
| "loss": -0.0197, |
| "num_tokens": 19036744.0, |
| "reward": 0.43990767002105713, |
| "reward_std": 0.5781359076499939, |
| "rewards/cosine_scaled_reward/mean": -0.16285866498947144, |
| "rewards/cosine_scaled_reward/std": 0.3335002362728119, |
| "rewards/format_reward/mean": 0.765625, |
| "rewards/format_reward/std": 0.42695629596710205, |
| "step": 182 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1984.0, |
| "completions/mean_length": 961.703125, |
| "completions/mean_terminated_length": 944.4603881835938, |
| "completions/min_length": 407.0, |
| "completions/min_terminated_length": 407.0, |
| "epoch": 0.20914285714285713, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13262300193309784, |
| "learning_rate": 1.220245676671809e-07, |
| "loss": -0.0021, |
| "num_tokens": 19108909.0, |
| "reward": 1.2695767879486084, |
| "reward_std": 0.678351640701294, |
| "rewards/cosine_scaled_reward/mean": 0.1504133641719818, |
| "rewards/cosine_scaled_reward/std": 0.47719594836235046, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 183 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.046875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1879.0, |
| "completions/mean_length": 1119.015625, |
| "completions/mean_terminated_length": 1073.3277587890625, |
| "completions/min_length": 196.0, |
| "completions/min_terminated_length": 196.0, |
| "epoch": 0.2102857142857143, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14593224227428436, |
| "learning_rate": 1.1966285981663407e-07, |
| "loss": 0.0004, |
| "num_tokens": 19191878.0, |
| "reward": 0.6385983228683472, |
| "reward_std": 0.41251492500305176, |
| "rewards/cosine_scaled_reward/mean": -0.18070080876350403, |
| "rewards/cosine_scaled_reward/std": 0.3674013912677765, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 184 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.0625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1734.0, |
| "completions/mean_length": 1069.359375, |
| "completions/mean_terminated_length": 1004.11669921875, |
| "completions/min_length": 317.0, |
| "completions/min_terminated_length": 317.0, |
| "epoch": 0.21142857142857144, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1318162977695465, |
| "learning_rate": 1.1743223682775649e-07, |
| "loss": 0.0032, |
| "num_tokens": 19270501.0, |
| "reward": 0.8208166360855103, |
| "reward_std": 0.5546972155570984, |
| "rewards/cosine_scaled_reward/mean": -0.08177915960550308, |
| "rewards/cosine_scaled_reward/std": 0.4313104748725891, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 185 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1826.0, |
| "completions/mean_length": 1249.34375, |
| "completions/mean_terminated_length": 1101.4444580078125, |
| "completions/min_length": 642.0, |
| "completions/min_terminated_length": 642.0, |
| "epoch": 0.21257142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15292362868785858, |
| "learning_rate": 1.1533337816991931e-07, |
| "loss": -0.0015, |
| "num_tokens": 19361155.0, |
| "reward": 0.9478594660758972, |
| "reward_std": 0.6171192526817322, |
| "rewards/cosine_scaled_reward/mean": 0.0520547591149807, |
| "rewards/cosine_scaled_reward/std": 0.46879369020462036, |
| "rewards/format_reward/mean": 0.84375, |
| "rewards/format_reward/std": 0.36596253514289856, |
| "step": 186 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.109375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1902.0, |
| "completions/mean_length": 1198.328125, |
| "completions/mean_terminated_length": 1093.982421875, |
| "completions/min_length": 439.0, |
| "completions/min_terminated_length": 439.0, |
| "epoch": 0.21371428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.1557895690202713, |
| "learning_rate": 1.1336692317580158e-07, |
| "loss": 0.0526, |
| "num_tokens": 19448272.0, |
| "reward": 0.8623722791671753, |
| "reward_std": 0.5490715503692627, |
| "rewards/cosine_scaled_reward/mean": -0.05318887531757355, |
| "rewards/cosine_scaled_reward/std": 0.4439024031162262, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 187 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2045.0, |
| "completions/mean_length": 1200.9375, |
| "completions/mean_terminated_length": 1062.3272705078125, |
| "completions/min_length": 385.0, |
| "completions/min_terminated_length": 385.0, |
| "epoch": 0.21485714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17476554214954376, |
| "learning_rate": 1.1153347084664419e-07, |
| "loss": 0.0307, |
| "num_tokens": 19536932.0, |
| "reward": 0.4879753887653351, |
| "reward_std": 0.5241237282752991, |
| "rewards/cosine_scaled_reward/mean": -0.21694980561733246, |
| "rewards/cosine_scaled_reward/std": 0.24976789951324463, |
| "rewards/format_reward/mean": 0.921875, |
| "rewards/format_reward/std": 0.27048972249031067, |
| "step": 188 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.078125, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1620.0, |
| "completions/mean_length": 850.9375, |
| "completions/mean_terminated_length": 749.4915161132812, |
| "completions/min_length": 211.0, |
| "completions/min_terminated_length": 211.0, |
| "epoch": 0.216, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13221082091331482, |
| "learning_rate": 1.0983357966978745e-07, |
| "loss": 0.0039, |
| "num_tokens": 19600680.0, |
| "reward": 0.7197285890579224, |
| "reward_std": 0.6500153541564941, |
| "rewards/cosine_scaled_reward/mean": -0.12451067566871643, |
| "rewards/cosine_scaled_reward/std": 0.41736096143722534, |
| "rewards/format_reward/mean": 0.96875, |
| "rewards/format_reward/std": 0.17536810040473938, |
| "step": 189 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1999.0, |
| "completions/mean_length": 1033.84375, |
| "completions/mean_terminated_length": 1017.74609375, |
| "completions/min_length": 434.0, |
| "completions/min_terminated_length": 434.0, |
| "epoch": 0.21714285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.12019925564527512, |
| "learning_rate": 1.0826776744855121e-07, |
| "loss": -0.0065, |
| "num_tokens": 19676614.0, |
| "reward": 1.0075644254684448, |
| "reward_std": 0.5787118673324585, |
| "rewards/cosine_scaled_reward/mean": 0.003782205283641815, |
| "rewards/cosine_scaled_reward/std": 0.4871532618999481, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 190 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1841.0, |
| "completions/mean_length": 932.953125, |
| "completions/mean_terminated_length": 915.2540283203125, |
| "completions/min_length": 370.0, |
| "completions/min_terminated_length": 370.0, |
| "epoch": 0.21828571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13237522542476654, |
| "learning_rate": 1.068365111445064e-07, |
| "loss": -0.0122, |
| "num_tokens": 19746611.0, |
| "reward": 1.093684196472168, |
| "reward_std": 0.551490068435669, |
| "rewards/cosine_scaled_reward/mean": 0.054654598236083984, |
| "rewards/cosine_scaled_reward/std": 0.45679154992103577, |
| "rewards/format_reward/mean": 0.984375, |
| "rewards/format_reward/std": 0.125, |
| "step": 191 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.21875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1955.0, |
| "completions/mean_length": 1365.53125, |
| "completions/mean_terminated_length": 1174.43994140625, |
| "completions/min_length": 488.0, |
| "completions/min_terminated_length": 488.0, |
| "epoch": 0.21942857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.15830805897712708, |
| "learning_rate": 1.0554024673218806e-07, |
| "loss": 0.0555, |
| "num_tokens": 19845301.0, |
| "reward": 0.5910857915878296, |
| "reward_std": 0.5397718548774719, |
| "rewards/cosine_scaled_reward/mean": -0.1341446042060852, |
| "rewards/cosine_scaled_reward/std": 0.3709484338760376, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 192 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1987.0, |
| "completions/mean_length": 1498.09375, |
| "completions/mean_terminated_length": 1189.6097412109375, |
| "completions/min_length": 343.0, |
| "completions/min_terminated_length": 343.0, |
| "epoch": 0.22057142857142858, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.192939892411232, |
| "learning_rate": 1.0437936906629334e-07, |
| "loss": 0.0325, |
| "num_tokens": 19952323.0, |
| "reward": 0.4451579749584198, |
| "reward_std": 0.6873714923858643, |
| "rewards/cosine_scaled_reward/mean": -0.1133585125207901, |
| "rewards/cosine_scaled_reward/std": 0.39295101165771484, |
| "rewards/format_reward/mean": 0.671875, |
| "rewards/format_reward/std": 0.4732423722743988, |
| "step": 193 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.359375, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1884.0, |
| "completions/mean_length": 1450.703125, |
| "completions/mean_terminated_length": 1115.6341552734375, |
| "completions/min_length": 496.0, |
| "completions/min_terminated_length": 496.0, |
| "epoch": 0.22171428571428572, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.16667421162128448, |
| "learning_rate": 1.0335423176140511e-07, |
| "loss": 0.0186, |
| "num_tokens": 20056528.0, |
| "reward": 0.7569496035575867, |
| "reward_std": 0.7739458084106445, |
| "rewards/cosine_scaled_reward/mean": 0.03472479432821274, |
| "rewards/cosine_scaled_reward/std": 0.4840702414512634, |
| "rewards/format_reward/mean": 0.6875, |
| "rewards/format_reward/std": 0.467176616191864, |
| "step": 194 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.171875, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2038.0, |
| "completions/mean_length": 1248.28125, |
| "completions/mean_terminated_length": 1082.3018798828125, |
| "completions/min_length": 302.0, |
| "completions/min_terminated_length": 302.0, |
| "epoch": 0.22285714285714286, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14721666276454926, |
| "learning_rate": 1.0246514708427701e-07, |
| "loss": 0.0652, |
| "num_tokens": 20147562.0, |
| "reward": 0.750861406326294, |
| "reward_std": 0.7012320756912231, |
| "rewards/cosine_scaled_reward/mean": -0.07769429683685303, |
| "rewards/cosine_scaled_reward/std": 0.406501442193985, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 195 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1995.0, |
| "completions/mean_length": 1270.9375, |
| "completions/mean_terminated_length": 1127.0369873046875, |
| "completions/min_length": 485.0, |
| "completions/min_terminated_length": 485.0, |
| "epoch": 0.224, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.17459681630134583, |
| "learning_rate": 1.017123858587145e-07, |
| "loss": 0.0453, |
| "num_tokens": 20240494.0, |
| "reward": 0.7895511388778687, |
| "reward_std": 0.8231704235076904, |
| "rewards/cosine_scaled_reward/mean": -0.03491191938519478, |
| "rewards/cosine_scaled_reward/std": 0.452151894569397, |
| "rewards/format_reward/mean": 0.859375, |
| "rewards/format_reward/std": 0.3503824472427368, |
| "step": 196 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.015625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1922.0, |
| "completions/mean_length": 960.125, |
| "completions/mean_terminated_length": 942.857177734375, |
| "completions/min_length": 332.0, |
| "completions/min_terminated_length": 332.0, |
| "epoch": 0.22514285714285714, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13420052826404572, |
| "learning_rate": 1.0109617738307911e-07, |
| "loss": 0.0164, |
| "num_tokens": 20312310.0, |
| "reward": 1.2753715515136719, |
| "reward_std": 0.8778545260429382, |
| "rewards/cosine_scaled_reward/mean": 0.13768577575683594, |
| "rewards/cosine_scaled_reward/std": 0.5257315039634705, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 197 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 1935.0, |
| "completions/mean_length": 1202.5, |
| "completions/mean_terminated_length": 1045.9259033203125, |
| "completions/min_length": 360.0, |
| "completions/min_terminated_length": 360.0, |
| "epoch": 0.22628571428571428, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14797629415988922, |
| "learning_rate": 1.0061670936044178e-07, |
| "loss": 0.0582, |
| "num_tokens": 20400774.0, |
| "reward": 0.7811780571937561, |
| "reward_std": 0.6028587818145752, |
| "rewards/cosine_scaled_reward/mean": -0.06253597885370255, |
| "rewards/cosine_scaled_reward/std": 0.4776788353919983, |
| "rewards/format_reward/mean": 0.90625, |
| "rewards/format_reward/std": 0.29378482699394226, |
| "step": 198 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.140625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2010.0, |
| "completions/mean_length": 1291.5, |
| "completions/mean_terminated_length": 1167.7091064453125, |
| "completions/min_length": 558.0, |
| "completions/min_terminated_length": 558.0, |
| "epoch": 0.22742857142857142, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.13744986057281494, |
| "learning_rate": 1.002741278414069e-07, |
| "loss": -0.0106, |
| "num_tokens": 20494934.0, |
| "reward": 0.6700541377067566, |
| "reward_std": 0.7563885450363159, |
| "rewards/cosine_scaled_reward/mean": -0.1649729162454605, |
| "rewards/cosine_scaled_reward/std": 0.3852999210357666, |
| "rewards/format_reward/mean": 1.0, |
| "rewards/format_reward/std": 0.0, |
| "step": 199 |
| }, |
| { |
| "clip_ratio/high_max": 0.0, |
| "clip_ratio/high_mean": 0.0, |
| "clip_ratio/low_mean": 0.0, |
| "clip_ratio/low_min": 0.0, |
| "clip_ratio/region_mean": 0.0, |
| "completions/clipped_ratio": 0.15625, |
| "completions/max_length": 2048.0, |
| "completions/max_terminated_length": 2016.0, |
| "completions/mean_length": 1129.3125, |
| "completions/mean_terminated_length": 959.1851806640625, |
| "completions/min_length": 177.0, |
| "completions/min_terminated_length": 177.0, |
| "epoch": 0.22857142857142856, |
| "frac_reward_zero_std": 0.0, |
| "grad_norm": 0.14575229585170746, |
| "learning_rate": 1.0006853717962393e-07, |
| "loss": 0.0553, |
| "num_tokens": 20577330.0, |
| "reward": 1.0288422107696533, |
| "reward_std": 0.7649609446525574, |
| "rewards/cosine_scaled_reward/mean": 0.04567110538482666, |
| "rewards/cosine_scaled_reward/std": 0.48820799589157104, |
| "rewards/format_reward/mean": 0.9375, |
| "rewards/format_reward/std": 0.24397502839565277, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.22857142857142856, |
| "step": 200, |
| "total_flos": 0.0, |
| "train_loss": 0.03278075817739591, |
| "train_runtime": 11302.2127, |
| "train_samples_per_second": 1.133, |
| "train_steps_per_second": 0.018 |
| } |
| ], |
| "logging_steps": 1, |
| "max_steps": 200, |
| "num_input_tokens_seen": 20577330, |
| "num_train_epochs": 1, |
| "save_steps": 50, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": true |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 0.0, |
| "train_batch_size": 4, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|