{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999111111111111, "eval_steps": 500, "global_step": 562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 528.1953125, "completions/mean_terminated_length": 382.9595947265625, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.0035555555555555557, "grad_norm": 0.5908958042623822, "kl": 0.0, "learning_rate": 0.0, "loss": 0.0924, "num_tokens": 87001.0, "reward": 0.19921875, "reward_std": 0.1400640904903412, "rewards/equation_reward_func/mean": 0.3984375, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 539.625, "completions/mean_terminated_length": 397.73736572265625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.0071111111111111115, "grad_norm": 0.5265386608630354, "kl": 0.0, "learning_rate": 2.941176470588235e-08, "loss": 0.048, "num_tokens": 175385.0, "reward": 0.2734375, "reward_std": 0.08956328779459, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 564.5703125, "completions/mean_terminated_length": 411.4270935058594, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.010666666666666666, "grad_norm": 0.6669684955633596, "kl": 0.0008296966552734375, "learning_rate": 5.88235294117647e-08, "loss": 0.0854, "num_tokens": 267090.0, "reward": 0.23828125, "reward_std": 0.13346019387245178, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 519.6484375, "completions/mean_terminated_length": 371.9090881347656, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.014222222222222223, "grad_norm": 0.8356994183639899, "kl": 0.0008006095886230469, "learning_rate": 8.823529411764706e-08, "loss": 0.0563, "num_tokens": 352981.0, "reward": 0.23046875, "reward_std": 0.1442507952451706, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 586.0078125, "completions/mean_terminated_length": 407.923095703125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.017777777777777778, "grad_norm": 0.5146228797786416, "kl": 0.0007901191711425781, "learning_rate": 1.176470588235294e-07, "loss": 0.1234, "num_tokens": 447406.0, "reward": 0.234375, "reward_std": 0.10518828779459, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 516.6796875, "completions/mean_terminated_length": 374.6300048828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.021333333333333333, "grad_norm": 0.6502615858390776, "kl": 0.0009016990661621094, "learning_rate": 1.4705882352941175e-07, "loss": 0.0145, "num_tokens": 532985.0, "reward": 0.2578125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 546.1875, "completions/mean_terminated_length": 393.4845275878906, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.024888888888888887, "grad_norm": 0.5087427899036316, "kl": 0.0009918212890625, "learning_rate": 1.764705882352941e-07, "loss": 0.0832, "num_tokens": 622297.0, "reward": 0.2890625, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 528.171875, "completions/mean_terminated_length": 401.7843322753906, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.028444444444444446, "grad_norm": 0.5571797599603905, "kl": 0.0008525848388671875, "learning_rate": 2.0588235294117645e-07, "loss": 0.0459, "num_tokens": 709263.0, "reward": 0.234375, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 624.609375, "completions/mean_terminated_length": 449.59552001953125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.032, "grad_norm": 0.5019588021071812, "kl": 0.0007739067077636719, "learning_rate": 2.352941176470588e-07, "loss": 0.0511, "num_tokens": 808689.0, "reward": 0.18359375, "reward_std": 0.10639689117670059, "rewards/equation_reward_func/mean": 0.3671875, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 584.71875, "completions/mean_terminated_length": 385.04547119140625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.035555555555555556, "grad_norm": 0.4425480618990905, "kl": 0.0007648468017578125, "learning_rate": 2.6470588235294114e-07, "loss": 0.0534, "num_tokens": 902953.0, "reward": 0.2421875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 533.421875, "completions/mean_terminated_length": 402.2772216796875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.03911111111111111, "grad_norm": 0.6821642124471641, "kl": 0.0007953643798828125, "learning_rate": 2.941176470588235e-07, "loss": 0.0794, "num_tokens": 990707.0, "reward": 0.265625, "reward_std": 0.11904378235340118, "rewards/equation_reward_func/mean": 0.53125, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 495.7734375, "completions/mean_terminated_length": 361.1274719238281, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.042666666666666665, "grad_norm": 0.3033516807371372, "kl": 0.0008263587951660156, "learning_rate": 3.2352941176470586e-07, "loss": 0.0285, "num_tokens": 1073538.0, "reward": 0.2890625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 504.125, "completions/mean_terminated_length": 365.14849853515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.04622222222222222, "grad_norm": 0.6637221608955077, "kl": 0.0007605552673339844, "learning_rate": 3.529411764705882e-07, "loss": 0.0467, "num_tokens": 1157418.0, "reward": 0.26171875, "reward_std": 0.10639689117670059, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 491.9375, "completions/mean_terminated_length": 375.3905029296875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.049777777777777775, "grad_norm": 0.7771391561940891, "kl": 0.0011105537414550781, "learning_rate": 3.8235294117647053e-07, "loss": 0.0902, "num_tokens": 1239762.0, "reward": 0.2890625, "reward_std": 0.1388554871082306, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 599.8515625, "completions/mean_terminated_length": 452.51580810546875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.05333333333333334, "grad_norm": 0.5023453309047337, "kl": 0.0007839202880859375, "learning_rate": 4.117647058823529e-07, "loss": 0.0137, "num_tokens": 1335947.0, "reward": 0.2109375, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 527.3046875, "completions/mean_terminated_length": 381.8080749511719, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05688888888888889, "grad_norm": 0.502715236717961, "kl": 0.0008416175842285156, "learning_rate": 4.4117647058823526e-07, "loss": 0.0778, "num_tokens": 1422846.0, "reward": 0.27734375, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 578.7265625, "completions/mean_terminated_length": 430.3020935058594, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.060444444444444446, "grad_norm": 0.5739590694285229, "kl": 0.0008263587951660156, "learning_rate": 4.705882352941176e-07, "loss": 0.0857, "num_tokens": 1516403.0, "reward": 0.23828125, "reward_std": 0.16647969186306, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 642.5625, "completions/mean_terminated_length": 469.18182373046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.064, "grad_norm": 0.5133981839793366, "kl": 0.0007915496826171875, "learning_rate": 5e-07, "loss": 0.0137, "num_tokens": 1618103.0, "reward": 0.1875, "reward_std": 0.09439768642187119, "rewards/equation_reward_func/mean": 0.375, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 583.140625, "completions/mean_terminated_length": 430.0000305175781, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.06755555555555555, "grad_norm": 0.5061139740383439, "kl": 0.0008330345153808594, "learning_rate": 4.999958464872182e-07, "loss": 0.0541, "num_tokens": 1712241.0, "reward": 0.234375, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 570.3125, "completions/mean_terminated_length": 378.75555419921875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.07111111111111111, "grad_norm": 0.4796470963634192, "kl": 0.0009145736694335938, "learning_rate": 4.999833860868863e-07, "loss": 0.101, "num_tokens": 1804665.0, "reward": 0.2421875, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 568.203125, "completions/mean_terminated_length": 428.6734619140625, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.07466666666666667, "grad_norm": 0.454066427267408, "kl": 0.0008668899536132812, "learning_rate": 4.999626192130396e-07, "loss": 0.0611, "num_tokens": 1896807.0, "reward": 0.23046875, "reward_std": 0.07514689117670059, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 567.828125, "completions/mean_terminated_length": 428.18365478515625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.07822222222222222, "grad_norm": 0.49542971737697244, "kl": 0.0009479522705078125, "learning_rate": 4.99933546555722e-07, "loss": 0.0644, "num_tokens": 1988845.0, "reward": 0.25390625, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 562.1953125, "completions/mean_terminated_length": 426.919189453125, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.08177777777777778, "grad_norm": 0.4209248402836188, "kl": 0.0009112358093261719, "learning_rate": 4.998961690809627e-07, "loss": 0.0707, "num_tokens": 2080194.0, "reward": 0.2734375, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 556.703125, "completions/mean_terminated_length": 431.78216552734375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.08533333333333333, "grad_norm": 0.5067760059517418, "kl": 0.0009670257568359375, "learning_rate": 4.998504880307444e-07, "loss": 0.0744, "num_tokens": 2170796.0, "reward": 0.25, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 433.015625, "completions/mean_terminated_length": 348.58929443359375, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.08888888888888889, "grad_norm": 0.48671458071416024, "kl": 0.0012998580932617188, "learning_rate": 4.997965049229614e-07, "loss": 0.0311, "num_tokens": 2245590.0, "reward": 0.34375, "reward_std": 0.058313291519880295, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 481.3359375, "completions/mean_terminated_length": 349.6213684082031, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.09244444444444444, "grad_norm": 0.5656218086796014, "kl": 0.0011796951293945312, "learning_rate": 4.997342215513703e-07, "loss": 0.0348, "num_tokens": 2326597.0, "reward": 0.33984375, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 634.15625, "completions/mean_terminated_length": 436.9411926269531, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.096, "grad_norm": 0.42273558010230877, "kl": 0.001247406005859375, "learning_rate": 4.99663639985529e-07, "loss": 0.0755, "num_tokens": 2427233.0, "reward": 0.21875, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 540.90625, "completions/mean_terminated_length": 366.1701965332031, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.09955555555555555, "grad_norm": 0.4341530606782327, "kl": 0.0013904571533203125, "learning_rate": 4.995847625707292e-07, "loss": 0.0464, "num_tokens": 2515821.0, "reward": 0.2734375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 585.265625, "completions/mean_terminated_length": 378.5057373046875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.10311111111111111, "grad_norm": 0.4018303205644928, "kl": 0.0012025833129882812, "learning_rate": 4.994975919279175e-07, "loss": 0.0859, "num_tokens": 2610123.0, "reward": 0.23046875, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.4609375, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 566.515625, "completions/mean_terminated_length": 387.5, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.10666666666666667, "grad_norm": 0.5158491908606221, "kl": 0.0013837814331054688, "learning_rate": 4.994021309536092e-07, "loss": 0.0288, "num_tokens": 2702073.0, "reward": 0.2421875, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 590.109375, "completions/mean_terminated_length": 378.20928955078125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.11022222222222222, "grad_norm": 0.3494723423356756, "kl": 0.001361846923828125, "learning_rate": 4.992983828197911e-07, "loss": 0.0336, "num_tokens": 2796983.0, "reward": 0.234375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 446.1640625, "completions/mean_terminated_length": 319.5904846191406, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.11377777777777778, "grad_norm": 0.5940018298701378, "kl": 0.001651763916015625, "learning_rate": 4.991863509738169e-07, "loss": 0.0808, "num_tokens": 2873436.0, "reward": 0.3203125, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 645.4375, "completions/mean_terminated_length": 473.3636474609375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.11733333333333333, "grad_norm": 0.30783057758617627, "kl": 0.0012383460998535156, "learning_rate": 4.990660391382923e-07, "loss": 0.021, "num_tokens": 2975516.0, "reward": 0.19921875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.3984375, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 644.484375, "completions/mean_terminated_length": 445.69049072265625, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.12088888888888889, "grad_norm": 0.3714414319586682, "kl": 0.0013628005981445312, "learning_rate": 4.989374513109511e-07, "loss": 0.0274, "num_tokens": 3077482.0, "reward": 0.22265625, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.4453125, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 528.609375, "completions/mean_terminated_length": 370.28863525390625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.12444444444444444, "grad_norm": 0.5450899553894905, "kl": 0.0016345977783203125, "learning_rate": 4.988005917645229e-07, "loss": 0.0743, "num_tokens": 3164492.0, "reward": 0.28125, "reward_std": 0.09858438372612, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 576.796875, "completions/mean_terminated_length": 445.7979736328125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.128, "grad_norm": 0.5131577542418286, "kl": 0.0014829635620117188, "learning_rate": 4.986554650465906e-07, "loss": 0.0681, "num_tokens": 3257790.0, "reward": 0.25390625, "reward_std": 0.10397969186306, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 574.5703125, "completions/mean_terminated_length": 370.2840881347656, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.13155555555555556, "grad_norm": 0.3379689817002036, "kl": 0.001598358154296875, "learning_rate": 4.985020759794397e-07, "loss": 0.0141, "num_tokens": 3350711.0, "reward": 0.27734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 612.046875, "completions/mean_terminated_length": 424.79547119140625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.1351111111111111, "grad_norm": 0.39638945137645676, "kl": 0.0016222000122070312, "learning_rate": 4.983404296598978e-07, "loss": 0.0435, "num_tokens": 3448509.0, "reward": 0.2109375, "reward_std": 0.06073049083352089, "rewards/equation_reward_func/mean": 0.421875, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 511.1640625, "completions/mean_terminated_length": 340.21875, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.13866666666666666, "grad_norm": 0.5794520029696754, "kl": 0.0018186569213867188, "learning_rate": 4.981705314591655e-07, "loss": 0.044, "num_tokens": 3533358.0, "reward": 0.26953125, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.53125, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 531.1015625, "completions/mean_terminated_length": 373.5773010253906, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.14222222222222222, "grad_norm": 0.4562238563268482, "kl": 0.0018701553344726562, "learning_rate": 4.979923870226372e-07, "loss": 0.043, "num_tokens": 3620695.0, "reward": 0.28515625, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 604.546875, "completions/mean_terminated_length": 413.8863830566406, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.14577777777777778, "grad_norm": 0.3478472447308095, "kl": 0.0015926361083984375, "learning_rate": 4.978060022697148e-07, "loss": 0.073, "num_tokens": 3717469.0, "reward": 0.25390625, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 608.875, "completions/mean_terminated_length": 398.87060546875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.14933333333333335, "grad_norm": 0.5522250614742202, "kl": 0.0018625259399414062, "learning_rate": 4.976113833936098e-07, "loss": 0.0955, "num_tokens": 3814777.0, "reward": 0.24609375, "reward_std": 0.10352964699268341, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 538.8046875, "completions/mean_terminated_length": 333.9444580078125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.15288888888888888, "grad_norm": 0.617559133328579, "kl": 0.0025587081909179688, "learning_rate": 4.974085368611381e-07, "loss": 0.0542, "num_tokens": 3903144.0, "reward": 0.28125, "reward_std": 0.09429339319467545, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 489.15625, "completions/mean_terminated_length": 365.73077392578125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.15644444444444444, "grad_norm": 0.3945474561913517, "kl": 0.0029430389404296875, "learning_rate": 4.971974694125051e-07, "loss": 0.0252, "num_tokens": 3985152.0, "reward": 0.328125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 596.890625, "completions/mean_terminated_length": 423.23077392578125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.16, "grad_norm": 0.47200084090922856, "kl": 0.002716064453125, "learning_rate": 4.969781880610813e-07, "loss": 0.008, "num_tokens": 4080974.0, "reward": 0.2578125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 557.875, "completions/mean_terminated_length": 382.45159912109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.16355555555555557, "grad_norm": 0.5432696452240244, "kl": 0.0032100677490234375, "learning_rate": 4.967507000931702e-07, "loss": 0.0715, "num_tokens": 4171754.0, "reward": 0.2734375, "reward_std": 0.08009214699268341, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 662.7109375, "completions/mean_terminated_length": 445.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.1671111111111111, "grad_norm": 0.38787950989505143, "kl": 0.00273895263671875, "learning_rate": 4.965150130677651e-07, "loss": 0.0783, "num_tokens": 4276013.0, "reward": 0.25, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 679.9765625, "completions/mean_terminated_length": 420.78082275390625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.17066666666666666, "grad_norm": 0.4121030344900999, "kl": 0.0033826828002929688, "learning_rate": 4.962711348162987e-07, "loss": 0.0817, "num_tokens": 4382454.0, "reward": 0.203125, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.40625, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 565.875, "completions/mean_terminated_length": 334.1176452636719, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.17422222222222222, "grad_norm": 0.33584885647385543, "kl": 0.00396728515625, "learning_rate": 4.960190734423824e-07, "loss": 0.0591, "num_tokens": 4474214.0, "reward": 0.30078125, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 593.9609375, "completions/mean_terminated_length": 425.6847839355469, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.17777777777777778, "grad_norm": 0.4341239740500275, "kl": 0.0038127899169921875, "learning_rate": 4.957588373215373e-07, "loss": 0.0354, "num_tokens": 4569653.0, "reward": 0.28125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 598.46875, "completions/mean_terminated_length": 375.5714416503906, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.18133333333333335, "grad_norm": 0.4161366154835123, "kl": 0.0038471221923828125, "learning_rate": 4.954904351009156e-07, "loss": 0.0412, "num_tokens": 4665629.0, "reward": 0.25, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 617.5703125, "completions/mean_terminated_length": 452.3186950683594, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.18488888888888888, "grad_norm": 0.3766044367830938, "kl": 0.00446319580078125, "learning_rate": 4.952138756990142e-07, "loss": 0.0784, "num_tokens": 4764074.0, "reward": 0.25390625, "reward_std": 0.09737578779459, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 605.7890625, "completions/mean_terminated_length": 415.6932067871094, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.18844444444444444, "grad_norm": 0.5625688615894612, "kl": 0.0045680999755859375, "learning_rate": 4.949291683053768e-07, "loss": 0.0839, "num_tokens": 4861031.0, "reward": 0.27734375, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 514.5859375, "completions/mean_terminated_length": 330.32977294921875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.192, "grad_norm": 0.44480643569932893, "kl": 0.00530242919921875, "learning_rate": 4.946363223802901e-07, "loss": 0.0755, "num_tokens": 4946286.0, "reward": 0.3359375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 550.203125, "completions/mean_terminated_length": 385.6210632324219, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.19555555555555557, "grad_norm": 0.40709067628271267, "kl": 0.005573272705078125, "learning_rate": 4.943353476544681e-07, "loss": 0.0644, "num_tokens": 5036084.0, "reward": 0.33203125, "reward_std": 0.07514689117670059, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 556.0703125, "completions/mean_terminated_length": 386.8191223144531, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.1991111111111111, "grad_norm": 0.581586116516997, "kl": 0.005886077880859375, "learning_rate": 4.940262541287302e-07, "loss": 0.0721, "num_tokens": 5126557.0, "reward": 0.328125, "reward_std": 0.10518828779459, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 614.1328125, "completions/mean_terminated_length": 441.0777893066406, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.20266666666666666, "grad_norm": 0.5671111826305489, "kl": 0.0063323974609375, "learning_rate": 4.937090520736671e-07, "loss": 0.0751, "num_tokens": 5224610.0, "reward": 0.2890625, "reward_std": 0.1256476789712906, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 615.8828125, "completions/mean_terminated_length": 386.9389953613281, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.20622222222222222, "grad_norm": 0.4358836994748318, "kl": 0.0069580078125, "learning_rate": 4.933837520293017e-07, "loss": 0.0335, "num_tokens": 5322815.0, "reward": 0.25390625, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 495.0625, "completions/mean_terminated_length": 353.6633605957031, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.20977777777777779, "grad_norm": 0.5538038947652355, "kl": 0.008365631103515625, "learning_rate": 4.930503648047367e-07, "loss": 0.0614, "num_tokens": 5405543.0, "reward": 0.34765625, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 697.75, "completions/mean_terminated_length": 427.4285888671875, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.21333333333333335, "grad_norm": 0.2906630805990007, "kl": 0.0069732666015625, "learning_rate": 4.927089014777972e-07, "loss": 0.0307, "num_tokens": 5514331.0, "reward": 0.20703125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 668.578125, "completions/mean_terminated_length": 455.32501220703125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.21688888888888888, "grad_norm": 0.3808946853447367, "kl": 0.007110595703125, "learning_rate": 4.923593733946614e-07, "loss": 0.025, "num_tokens": 5619337.0, "reward": 0.25390625, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 611.6015625, "completions/mean_terminated_length": 410.1976623535156, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.22044444444444444, "grad_norm": 0.42168222974248926, "kl": 0.00823211669921875, "learning_rate": 4.920017921694841e-07, "loss": 0.0709, "num_tokens": 5716966.0, "reward": 0.296875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 650.53125, "completions/mean_terminated_length": 403.1688232421875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.224, "grad_norm": 0.6149747055540251, "kl": 0.008392333984375, "learning_rate": 4.91636169684011e-07, "loss": 0.0643, "num_tokens": 5819698.0, "reward": 0.234375, "reward_std": 0.09198048710823059, "rewards/equation_reward_func/mean": 0.46875, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 626.5625, "completions/mean_terminated_length": 380.0506286621094, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.22755555555555557, "grad_norm": 0.439322705873041, "kl": 0.008251190185546875, "learning_rate": 4.912625180871833e-07, "loss": 0.0747, "num_tokens": 5919310.0, "reward": 0.28515625, "reward_std": 0.08175078779459, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 605.0390625, "completions/mean_terminated_length": 414.602294921875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.2311111111111111, "grad_norm": 0.6380829984207932, "kl": 0.0090484619140625, "learning_rate": 4.908808497947346e-07, "loss": 0.0796, "num_tokens": 6016115.0, "reward": 0.2890625, "reward_std": 0.1051882952451706, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 640.7578125, "completions/mean_terminated_length": 403.0506286621094, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.23466666666666666, "grad_norm": 0.48865757330971626, "kl": 0.0081634521484375, "learning_rate": 4.904911774887779e-07, "loss": 0.1205, "num_tokens": 6117516.0, "reward": 0.28125, "reward_std": 0.10760548710823059, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 650.9765625, "completions/mean_terminated_length": 369.9315185546875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.23822222222222222, "grad_norm": 0.3083233072558277, "kl": 0.00861358642578125, "learning_rate": 4.900935141173842e-07, "loss": 0.0187, "num_tokens": 6220269.0, "reward": 0.25390625, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 570.59375, "completions/mean_terminated_length": 371.9101257324219, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.24177777777777779, "grad_norm": 0.47697021789482175, "kl": 0.01125335693359375, "learning_rate": 4.896878728941531e-07, "loss": 0.1055, "num_tokens": 6312653.0, "reward": 0.3046875, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 611.640625, "completions/mean_terminated_length": 388.072265625, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.24533333333333332, "grad_norm": 0.305335566829751, "kl": 0.0103607177734375, "learning_rate": 4.892742672977722e-07, "loss": 0.0216, "num_tokens": 6410395.0, "reward": 0.28515625, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 586.15625, "completions/mean_terminated_length": 408.1318664550781, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.24888888888888888, "grad_norm": 0.49674021978112454, "kl": 0.01163482666015625, "learning_rate": 4.888527110715709e-07, "loss": 0.1073, "num_tokens": 6504771.0, "reward": 0.30078125, "reward_std": 0.10639688372612, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 581.9140625, "completions/mean_terminated_length": 402.16485595703125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.25244444444444447, "grad_norm": 0.4329272005721831, "kl": 0.01168060302734375, "learning_rate": 4.884232182230623e-07, "loss": 0.0424, "num_tokens": 6598640.0, "reward": 0.3046875, "reward_std": 0.08537658303976059, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 540.9453125, "completions/mean_terminated_length": 373.1473693847656, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.256, "grad_norm": 0.46644475634716426, "kl": 0.01280975341796875, "learning_rate": 4.879858030234789e-07, "loss": 0.0158, "num_tokens": 6687229.0, "reward": 0.3203125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 593.3125, "completions/mean_terminated_length": 367.71429443359375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.25955555555555554, "grad_norm": 0.4908065819406202, "kl": 0.0121307373046875, "learning_rate": 4.875404800072976e-07, "loss": 0.0903, "num_tokens": 6782577.0, "reward": 0.3046875, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 589.5078125, "completions/mean_terminated_length": 399.11236572265625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.26311111111111113, "grad_norm": 0.5461815549169872, "kl": 0.01345062255859375, "learning_rate": 4.870872639717572e-07, "loss": 0.0907, "num_tokens": 6877502.0, "reward": 0.3046875, "reward_std": 0.11662658303976059, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 672.296875, "completions/mean_terminated_length": 415.6486511230469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.26666666666666666, "grad_norm": 0.43393823749020094, "kl": 0.01262664794921875, "learning_rate": 4.866261699763664e-07, "loss": 0.0607, "num_tokens": 6983012.0, "reward": 0.26171875, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 694.21875, "completions/mean_terminated_length": 437.72222900390625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.2702222222222222, "grad_norm": 0.4005786072285817, "kl": 0.01300048828125, "learning_rate": 4.861572133424035e-07, "loss": 0.0619, "num_tokens": 7091320.0, "reward": 0.24609375, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 658.125, "completions/mean_terminated_length": 438.6000061035156, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.2737777777777778, "grad_norm": 0.5289934674477423, "kl": 0.0152740478515625, "learning_rate": 4.856804096524078e-07, "loss": 0.0807, "num_tokens": 7194960.0, "reward": 0.2734375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 573.484375, "completions/mean_terminated_length": 376.0674133300781, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2773333333333333, "grad_norm": 0.4984901866701426, "kl": 0.0174713134765625, "learning_rate": 4.851957747496606e-07, "loss": 0.0622, "num_tokens": 7287782.0, "reward": 0.31640625, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 580.875, "completions/mean_terminated_length": 379.4545593261719, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.2808888888888889, "grad_norm": 0.43726864140919364, "kl": 0.01889801025390625, "learning_rate": 4.847033247376605e-07, "loss": 0.0653, "num_tokens": 7381486.0, "reward": 0.31640625, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 630.3046875, "completions/mean_terminated_length": 394.0874938964844, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.28444444444444444, "grad_norm": 0.33118079029268543, "kl": 0.0195770263671875, "learning_rate": 4.842030759795866e-07, "loss": 0.0409, "num_tokens": 7481565.0, "reward": 0.28125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 609.2578125, "completions/mean_terminated_length": 434.14447021484375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.288, "grad_norm": 0.3267731560238877, "kl": 0.019378662109375, "learning_rate": 4.836950450977558e-07, "loss": 0.0812, "num_tokens": 7579046.0, "reward": 0.26953125, "reward_std": 0.05952189117670059, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 792.0, "completions/mean_length": 538.203125, "completions/mean_terminated_length": 333.0888977050781, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.29155555555555557, "grad_norm": 0.22793681302257102, "kl": 0.023040771484375, "learning_rate": 4.831792489730703e-07, "loss": 0.0266, "num_tokens": 7667300.0, "reward": 0.33984375, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 597.3671875, "completions/mean_terminated_length": 358.03656005859375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.2951111111111111, "grad_norm": 0.3445223190797135, "kl": 0.020843505859375, "learning_rate": 4.826557047444563e-07, "loss": 0.0404, "num_tokens": 7763147.0, "reward": 0.29296875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 870.0, "completions/mean_length": 620.296875, "completions/mean_terminated_length": 369.89874267578125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.2986666666666667, "grad_norm": 0.2752562278412603, "kl": 0.021240234375, "learning_rate": 4.821244298082951e-07, "loss": 0.0564, "num_tokens": 7861961.0, "reward": 0.28125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 618.1171875, "completions/mean_terminated_length": 382.6049499511719, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.3022222222222222, "grad_norm": 4.058137865855706, "kl": 0.0670166015625, "learning_rate": 4.815854418178445e-07, "loss": 0.0505, "num_tokens": 7960460.0, "reward": 0.30078125, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 603.3046875, "completions/mean_terminated_length": 315.4605407714844, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.30577777777777776, "grad_norm": 0.3048722926307626, "kl": 0.0247955322265625, "learning_rate": 4.810387586826527e-07, "loss": 0.0564, "num_tokens": 8057031.0, "reward": 0.2890625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 618.0859375, "completions/mean_terminated_length": 374.5375061035156, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.30933333333333335, "grad_norm": 0.42519742796754373, "kl": 0.02471923828125, "learning_rate": 4.804843985679626e-07, "loss": 0.0723, "num_tokens": 8155606.0, "reward": 0.296875, "reward_std": 0.07635548710823059, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 668.109375, "completions/mean_terminated_length": 454.57501220703125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.3128888888888889, "grad_norm": 0.3755980737520331, "kl": 0.02496337890625, "learning_rate": 4.799223798941089e-07, "loss": 0.0714, "num_tokens": 8260612.0, "reward": 0.2890625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 631.5390625, "completions/mean_terminated_length": 379.9615478515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.3164444444444444, "grad_norm": 0.4944433917274943, "kl": 0.0275726318359375, "learning_rate": 4.793527213359058e-07, "loss": 0.0682, "num_tokens": 8360861.0, "reward": 0.28125, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 547.0234375, "completions/mean_terminated_length": 381.33685302734375, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.32, "grad_norm": 0.2653180009086222, "kl": 0.032012939453125, "learning_rate": 4.787754418220257e-07, "loss": 0.0358, "num_tokens": 8450268.0, "reward": 0.36328125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 481.9140625, "completions/mean_terminated_length": 323.1212158203125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.32355555555555554, "grad_norm": 0.5025563918324205, "kl": 0.02996826171875, "learning_rate": 4.781905605343716e-07, "loss": 0.0519, "num_tokens": 8531289.0, "reward": 0.359375, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 626.703125, "completions/mean_terminated_length": 396.1728515625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.32711111111111113, "grad_norm": 0.4571407338462829, "kl": 0.0293426513671875, "learning_rate": 4.775980969074385e-07, "loss": 0.1161, "num_tokens": 8630915.0, "reward": 0.28125, "reward_std": 0.11420939117670059, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 619.46875, "completions/mean_terminated_length": 368.5569763183594, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.33066666666666666, "grad_norm": 0.5795568313921826, "kl": 0.0264129638671875, "learning_rate": 4.769980706276687e-07, "loss": 0.0998, "num_tokens": 8729623.0, "reward": 0.26953125, "reward_std": 0.1015625, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 603.734375, "completions/mean_terminated_length": 391.1294250488281, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.3342222222222222, "grad_norm": 0.3726799355706054, "kl": 0.0304107666015625, "learning_rate": 4.7639050163279646e-07, "loss": 0.0597, "num_tokens": 8826277.0, "reward": 0.3203125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 494.109375, "completions/mean_terminated_length": 365.4951477050781, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.3377777777777778, "grad_norm": 0.41719180551990875, "kl": 0.034393310546875, "learning_rate": 4.757754101111867e-07, "loss": 0.0278, "num_tokens": 8908815.0, "reward": 0.3515625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 685.4921875, "completions/mean_terminated_length": 386.808837890625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.3413333333333333, "grad_norm": 0.41428514561249796, "kl": 0.0258636474609375, "learning_rate": 4.751528165011633e-07, "loss": 0.0483, "num_tokens": 9015998.0, "reward": 0.26171875, "reward_std": 0.06854298710823059, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 649.84375, "completions/mean_terminated_length": 446.9879455566406, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.3448888888888889, "grad_norm": 0.5611786676336825, "kl": 0.0316162109375, "learning_rate": 4.7452274149033036e-07, "loss": 0.0991, "num_tokens": 9118566.0, "reward": 0.30078125, "reward_std": 0.10639689117670059, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 539.46875, "completions/mean_terminated_length": 384.6185302734375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.34844444444444445, "grad_norm": 0.49912428807698755, "kl": 0.033843994140625, "learning_rate": 4.738852060148848e-07, "loss": 0.041, "num_tokens": 9206994.0, "reward": 0.35546875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 650.4296875, "completions/mean_terminated_length": 359.875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.352, "grad_norm": 0.36141150551462675, "kl": 0.0316619873046875, "learning_rate": 4.7324023125892067e-07, "loss": 0.0394, "num_tokens": 9309665.0, "reward": 0.2734375, "reward_std": 0.047418396919965744, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 638.1484375, "completions/mean_terminated_length": 398.8227844238281, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.35555555555555557, "grad_norm": 0.4782158236016533, "kl": 0.02935791015625, "learning_rate": 4.7258783865372496e-07, "loss": 0.0815, "num_tokens": 9410800.0, "reward": 0.2890625, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 585.2265625, "completions/mean_terminated_length": 347.33734130859375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.3591111111111111, "grad_norm": 0.43167749972004266, "kl": 0.0333709716796875, "learning_rate": 4.719280498770659e-07, "loss": 0.0667, "num_tokens": 9505033.0, "reward": 0.3046875, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 611.359375, "completions/mean_terminated_length": 409.83721923828125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.3626666666666667, "grad_norm": 0.4856490802735953, "kl": 0.034454345703125, "learning_rate": 4.712608868524726e-07, "loss": 0.0187, "num_tokens": 9602691.0, "reward": 0.3125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 634.6875, "completions/mean_terminated_length": 368.3157958984375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.3662222222222222, "grad_norm": 0.46774399337658157, "kl": 0.0301513671875, "learning_rate": 4.70586371748506e-07, "loss": 0.0693, "num_tokens": 9703327.0, "reward": 0.28515625, "reward_std": 0.05952189117670059, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 571.359375, "completions/mean_terminated_length": 334.26190185546875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.36977777777777776, "grad_norm": 0.5546889929256124, "kl": 0.0407257080078125, "learning_rate": 4.699045269780232e-07, "loss": 0.1315, "num_tokens": 9795885.0, "reward": 0.32421875, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 534.9140625, "completions/mean_terminated_length": 328.4111328125, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.37333333333333335, "grad_norm": 0.4558550604552232, "kl": 0.04144287109375, "learning_rate": 4.692153751974318e-07, "loss": 0.0219, "num_tokens": 9883714.0, "reward": 0.328125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 585.7265625, "completions/mean_terminated_length": 348.1084289550781, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.3768888888888889, "grad_norm": 0.2528681789268111, "kl": 0.042144775390625, "learning_rate": 4.685189393059377e-07, "loss": 0.0215, "num_tokens": 9978091.0, "reward": 0.30859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 550.0546875, "completions/mean_terminated_length": 357.3516540527344, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.3804444444444444, "grad_norm": 0.5414337594332409, "kl": 0.045440673828125, "learning_rate": 4.6781524244478374e-07, "loss": 0.0318, "num_tokens": 10067886.0, "reward": 0.33203125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 610.8203125, "completions/mean_terminated_length": 345.9615478515625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.384, "grad_norm": 0.5284618031034961, "kl": 0.037109375, "learning_rate": 4.6710430799648143e-07, "loss": 0.0966, "num_tokens": 10165515.0, "reward": 0.28125, "reward_std": 0.09198048710823059, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 800.0, "completions/mean_length": 668.6171875, "completions/mean_terminated_length": 383.3098449707031, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.38755555555555554, "grad_norm": 0.22935085829686072, "kl": 0.0354156494140625, "learning_rate": 4.663861595840332e-07, "loss": 0.0321, "num_tokens": 10270534.0, "reward": 0.2734375, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 479.1171875, "completions/mean_terminated_length": 340.2254943847656, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.39111111111111113, "grad_norm": 0.5408265324543865, "kl": 0.048675537109375, "learning_rate": 4.6566082107014795e-07, "loss": 0.1005, "num_tokens": 10351213.0, "reward": 0.375, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 613.3125, "completions/mean_terminated_length": 412.7441711425781, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.39466666666666667, "grad_norm": 0.49874666045088994, "kl": 0.046173095703125, "learning_rate": 4.649283165564479e-07, "loss": 0.0629, "num_tokens": 10449041.0, "reward": 0.30859375, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 632.9140625, "completions/mean_terminated_length": 390.341796875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.3982222222222222, "grad_norm": 0.31655960346507145, "kl": 0.0451812744140625, "learning_rate": 4.6418867038266807e-07, "loss": 0.0394, "num_tokens": 10549470.0, "reward": 0.27734375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 579.9453125, "completions/mean_terminated_length": 322.2839660644531, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4017777777777778, "grad_norm": 0.4352484669765941, "kl": 0.043731689453125, "learning_rate": 4.6344190712584713e-07, "loss": 0.0848, "num_tokens": 10643159.0, "reward": 0.3125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 596.859375, "completions/mean_terminated_length": 380.7764892578125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.4053333333333333, "grad_norm": 0.3951026471626352, "kl": 0.04644775390625, "learning_rate": 4.6268805159951086e-07, "loss": 0.055, "num_tokens": 10738953.0, "reward": 0.30859375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 581.7578125, "completions/mean_terminated_length": 365.7790832519531, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4088888888888889, "grad_norm": 0.5794060540538886, "kl": 0.050384521484375, "learning_rate": 4.619271288528478e-07, "loss": 0.0514, "num_tokens": 10832770.0, "reward": 0.30078125, "reward_std": 0.09077189117670059, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 567.0625, "completions/mean_terminated_length": 335.9058837890625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.41244444444444445, "grad_norm": 0.3285138688218659, "kl": 0.060150146484375, "learning_rate": 4.611591641698768e-07, "loss": 0.0232, "num_tokens": 10924742.0, "reward": 0.31640625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 554.8828125, "completions/mean_terminated_length": 378.3333435058594, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.416, "grad_norm": 0.3455451639691987, "kl": 0.045989990234375, "learning_rate": 4.6038418306860695e-07, "loss": 0.0554, "num_tokens": 11015203.0, "reward": 0.34375, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 710.0859375, "completions/mean_terminated_length": 386.20635986328125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.41955555555555557, "grad_norm": 0.2465126222436747, "kl": 0.0394287109375, "learning_rate": 4.596022113001894e-07, "loss": 0.0296, "num_tokens": 11125598.0, "reward": 0.2421875, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.484375, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 625.5, "completions/mean_terminated_length": 370.0513000488281, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.4231111111111111, "grad_norm": 0.4033702399842539, "kl": 0.04541015625, "learning_rate": 4.58813274848062e-07, "loss": 0.0279, "num_tokens": 11225038.0, "reward": 0.2734375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 581.90625, "completions/mean_terminated_length": 388.1797790527344, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.4266666666666667, "grad_norm": 0.42264658082590667, "kl": 0.049041748046875, "learning_rate": 4.5801739992708604e-07, "loss": 0.048, "num_tokens": 11318978.0, "reward": 0.328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 655.34375, "completions/mean_terminated_length": 411.1688232421875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.43022222222222223, "grad_norm": 0.6094671551674041, "kl": 0.04888916015625, "learning_rate": 4.572146129826746e-07, "loss": 0.1119, "num_tokens": 11422286.0, "reward": 0.28515625, "reward_std": 0.10639689117670059, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 560.1015625, "completions/mean_terminated_length": 392.3085021972656, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.43377777777777776, "grad_norm": 0.47078312674279843, "kl": 0.050872802734375, "learning_rate": 4.5640494068991454e-07, "loss": 0.0888, "num_tokens": 11513375.0, "reward": 0.3515625, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 591.5390625, "completions/mean_terminated_length": 348.93902587890625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.43733333333333335, "grad_norm": 155.56901247014852, "kl": 5.67572021484375, "learning_rate": 4.555884099526793e-07, "loss": 0.0873, "num_tokens": 11608416.0, "reward": 0.30078125, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 567.8359375, "completions/mean_terminated_length": 360.4886474609375, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.4408888888888889, "grad_norm": 0.5750310459585855, "kl": 0.054229736328125, "learning_rate": 4.547650479027361e-07, "loss": 0.0879, "num_tokens": 11700499.0, "reward": 0.328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 529.0234375, "completions/mean_terminated_length": 349.9893493652344, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.4444444444444444, "grad_norm": 0.5685720790639835, "kl": 0.055267333984375, "learning_rate": 4.53934881898843e-07, "loss": 0.0597, "num_tokens": 11787574.0, "reward": 0.34375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 614.6796875, "completions/mean_terminated_length": 360.7974853515625, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.448, "grad_norm": 0.37356202522827936, "kl": 0.051544189453125, "learning_rate": 4.5309793952584095e-07, "loss": 0.0359, "num_tokens": 11885661.0, "reward": 0.29296875, "reward_std": 0.05952189117670059, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 627.8515625, "completions/mean_terminated_length": 382.1392517089844, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.45155555555555554, "grad_norm": 0.44495090171258866, "kl": 0.0501708984375, "learning_rate": 4.5225424859373684e-07, "loss": 0.0928, "num_tokens": 11985430.0, "reward": 0.3046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 494.8359375, "completions/mean_terminated_length": 318.44793701171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.45511111111111113, "grad_norm": 0.4861829577719564, "kl": 0.052734375, "learning_rate": 4.514038371367791e-07, "loss": 0.0653, "num_tokens": 12068117.0, "reward": 0.36328125, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 649.375, "completions/mean_terminated_length": 376.0, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.45866666666666667, "grad_norm": 0.3322027474284715, "kl": 0.06036376953125, "learning_rate": 4.5054673341252657e-07, "loss": 0.0364, "num_tokens": 12170669.0, "reward": 0.28125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 679.640625, "completions/mean_terminated_length": 385.18841552734375, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.4622222222222222, "grad_norm": 0.4572037215164497, "kl": 0.047027587890625, "learning_rate": 4.496829659009095e-07, "loss": 0.1176, "num_tokens": 12277143.0, "reward": 0.26953125, "reward_std": 0.09979298710823059, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 602.34375, "completions/mean_terminated_length": 357.67901611328125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.4657777777777778, "grad_norm": 0.4232004082309366, "kl": 0.0789794921875, "learning_rate": 4.488125633032831e-07, "loss": 0.0808, "num_tokens": 12373655.0, "reward": 0.31640625, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 574.375, "completions/mean_terminated_length": 362.4827575683594, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.4693333333333333, "grad_norm": 0.4351813687560114, "kl": 0.056732177734375, "learning_rate": 4.479355545414738e-07, "loss": 0.0514, "num_tokens": 12466611.0, "reward": 0.328125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 527.6015625, "completions/mean_terminated_length": 333.35870361328125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.4728888888888889, "grad_norm": 0.5279732026913658, "kl": 0.05914306640625, "learning_rate": 4.470519687568185e-07, "loss": 0.0839, "num_tokens": 12553540.0, "reward": 0.3515625, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 635.84375, "completions/mean_terminated_length": 352.5946044921875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.47644444444444445, "grad_norm": 0.5052685542188035, "kl": 0.055328369140625, "learning_rate": 4.4616183530919604e-07, "loss": 0.0643, "num_tokens": 12654432.0, "reward": 0.28125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 590.6171875, "completions/mean_terminated_length": 347.5, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.48, "grad_norm": 0.4484118446690246, "kl": 0.056121826171875, "learning_rate": 4.452651837760515e-07, "loss": 0.0466, "num_tokens": 12749443.0, "reward": 0.3125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 538.0234375, "completions/mean_terminated_length": 382.7113342285156, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.48355555555555557, "grad_norm": 0.44153093391749104, "kl": 0.05999755859375, "learning_rate": 4.443620439514138e-07, "loss": 0.0777, "num_tokens": 12837686.0, "reward": 0.37109375, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 639.90625, "completions/mean_terminated_length": 401.6708984375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.4871111111111111, "grad_norm": 0.5720546277144067, "kl": 0.059478759765625, "learning_rate": 4.4345244584490535e-07, "loss": 0.1217, "num_tokens": 12939086.0, "reward": 0.296875, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 679.4921875, "completions/mean_terminated_length": 419.9315185546875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.49066666666666664, "grad_norm": 0.329474154400109, "kl": 0.058502197265625, "learning_rate": 4.4253641968074505e-07, "loss": 0.0607, "num_tokens": 13045505.0, "reward": 0.26953125, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 651.234375, "completions/mean_terminated_length": 370.383544921875, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.49422222222222223, "grad_norm": 0.4098839684966374, "kl": 0.06768798828125, "learning_rate": 4.41613995896744e-07, "loss": 0.0818, "num_tokens": 13148251.0, "reward": 0.27734375, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 540.1953125, "completions/mean_terminated_length": 378.9270935058594, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.49777777777777776, "grad_norm": 0.34554172929002813, "kl": 0.06378173828125, "learning_rate": 4.40685205143294e-07, "loss": 0.0303, "num_tokens": 13236768.0, "reward": 0.3671875, "reward_std": 0.04268829524517059, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 580.53125, "completions/mean_terminated_length": 305.4683532714844, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.5013333333333333, "grad_norm": 0.3688172474905321, "kl": 0.061492919921875, "learning_rate": 4.3975007828234914e-07, "loss": 0.0669, "num_tokens": 13330460.0, "reward": 0.3046875, "reward_std": 0.058313287794589996, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 578.0859375, "completions/mean_terminated_length": 382.6853942871094, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5048888888888889, "grad_norm": 0.30931377681113237, "kl": 0.060394287109375, "learning_rate": 4.3880864638640035e-07, "loss": 0.0146, "num_tokens": 13423887.0, "reward": 0.33203125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 615.1171875, "completions/mean_terminated_length": 385.743896484375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.5084444444444445, "grad_norm": 0.5341523507516942, "kl": 0.0673828125, "learning_rate": 4.37860940737443e-07, "loss": 0.1108, "num_tokens": 13522102.0, "reward": 0.2890625, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 619.796875, "completions/mean_terminated_length": 315.2602844238281, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.512, "grad_norm": 0.41305536162859396, "kl": 0.062042236328125, "learning_rate": 4.3690699282593723e-07, "loss": 0.0898, "num_tokens": 13620868.0, "reward": 0.26953125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 565.3984375, "completions/mean_terminated_length": 356.9432067871094, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.5155555555555555, "grad_norm": 0.4811823406061691, "kl": 0.058807373046875, "learning_rate": 4.3594683434976186e-07, "loss": 0.0429, "num_tokens": 13712663.0, "reward": 0.30859375, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 663.703125, "completions/mean_terminated_length": 365.1714172363281, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.5191111111111111, "grad_norm": 0.467468001270948, "kl": 0.057647705078125, "learning_rate": 4.3498049721316087e-07, "loss": 0.0611, "num_tokens": 13817021.0, "reward": 0.25, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.5, "rewards/equation_reward_func/std": 0.5019646286964417, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 653.421875, "completions/mean_terminated_length": 365.1944580078125, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5226666666666666, "grad_norm": 0.3418013446712315, "kl": 0.057098388671875, "learning_rate": 4.340080135256835e-07, "loss": 0.056, "num_tokens": 13920127.0, "reward": 0.2734375, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 541.2265625, "completions/mean_terminated_length": 344.93408203125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.5262222222222223, "grad_norm": 0.6339833617232565, "kl": 0.061370849609375, "learning_rate": 4.3302941560111716e-07, "loss": 0.0777, "num_tokens": 14008756.0, "reward": 0.33984375, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 527.5390625, "completions/mean_terminated_length": 375.56121826171875, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5297777777777778, "grad_norm": 0.3438312380077029, "kl": 0.06854248046875, "learning_rate": 4.3204473595641367e-07, "loss": 0.0187, "num_tokens": 14095593.0, "reward": 0.375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 562.0703125, "completions/mean_terminated_length": 344.3793029785156, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.5333333333333333, "grad_norm": 0.4508123578700561, "kl": 0.07012939453125, "learning_rate": 4.3105400731060896e-07, "loss": 0.0734, "num_tokens": 14186902.0, "reward": 0.328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 633.3125, "completions/mean_terminated_length": 406.6172790527344, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.5368888888888889, "grad_norm": 0.3459680955939099, "kl": 0.056427001953125, "learning_rate": 4.300572625837359e-07, "loss": 0.0398, "num_tokens": 14287422.0, "reward": 0.3046875, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 565.4609375, "completions/mean_terminated_length": 386.0326232910156, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.5404444444444444, "grad_norm": 0.43164118728346484, "kl": 0.06219482421875, "learning_rate": 4.2905453489573007e-07, "loss": 0.068, "num_tokens": 14379241.0, "reward": 0.34375, "reward_std": 0.08295938372612, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 615.921875, "completions/mean_terminated_length": 336.7105407714844, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.544, "grad_norm": 0.5003631588216835, "kl": 0.06854248046875, "learning_rate": 4.280458575653296e-07, "loss": 0.0297, "num_tokens": 14477451.0, "reward": 0.28515625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 523.78125, "completions/mean_terminated_length": 335.5268859863281, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5475555555555556, "grad_norm": 0.5041557972475773, "kl": 0.06591796875, "learning_rate": 4.2703126410896815e-07, "loss": 0.0435, "num_tokens": 14563867.0, "reward": 0.3515625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 576.546875, "completions/mean_terminated_length": 365.67816162109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.5511111111111111, "grad_norm": 0.46354543664024916, "kl": 0.0675048828125, "learning_rate": 4.2601078823966065e-07, "loss": 0.0591, "num_tokens": 14657049.0, "reward": 0.32421875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 547.6171875, "completions/mean_terminated_length": 323.11492919921875, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5546666666666666, "grad_norm": 0.48578806239601413, "kl": 0.060638427734375, "learning_rate": 4.249844638658837e-07, "loss": 0.0961, "num_tokens": 14746448.0, "reward": 0.33203125, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 491.1875, "completions/mean_terminated_length": 320.9071960449219, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.5582222222222222, "grad_norm": 0.5266763435256532, "kl": 0.072357177734375, "learning_rate": 4.2395232509044856e-07, "loss": 0.0163, "num_tokens": 14828708.0, "reward": 0.3671875, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 565.1328125, "completions/mean_terminated_length": 371.3888854980469, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.5617777777777778, "grad_norm": 0.521632115358521, "kl": 0.06085205078125, "learning_rate": 4.229144062093679e-07, "loss": 0.071, "num_tokens": 14920521.0, "reward": 0.34375, "reward_std": 0.06865385174751282, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 554.7265625, "completions/mean_terminated_length": 349.08990478515625, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.5653333333333334, "grad_norm": 0.540491795981583, "kl": 0.1163330078125, "learning_rate": 4.218707417107166e-07, "loss": 0.0065, "num_tokens": 15010934.0, "reward": 0.32421875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 603.4921875, "completions/mean_terminated_length": 375.5060119628906, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.5688888888888889, "grad_norm": 0.29353490325809917, "kl": 0.052581787109375, "learning_rate": 4.208213662734852e-07, "loss": 0.0497, "num_tokens": 15107601.0, "reward": 0.3203125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 583.5625, "completions/mean_terminated_length": 368.4651184082031, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.5724444444444444, "grad_norm": 0.528206473932766, "kl": 0.07403564453125, "learning_rate": 4.197663147664281e-07, "loss": 0.0775, "num_tokens": 15201653.0, "reward": 0.3203125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 626.890625, "completions/mean_terminated_length": 396.4691467285156, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.576, "grad_norm": 0.5652806374301126, "kl": 0.059112548828125, "learning_rate": 4.187056222469046e-07, "loss": 0.0756, "num_tokens": 15301279.0, "reward": 0.296875, "reward_std": 0.09858438372612, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 593.78125, "completions/mean_terminated_length": 391.03448486328125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.5795555555555556, "grad_norm": 0.30091593501355973, "kl": 0.06610107421875, "learning_rate": 4.1763932395971433e-07, "loss": 0.0173, "num_tokens": 15396663.0, "reward": 0.3203125, "reward_std": 0.024646097794175148, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 568.578125, "completions/mean_terminated_length": 369.01123046875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.5831111111111111, "grad_norm": 0.550349967678839, "kl": 0.063629150390625, "learning_rate": 4.1656745533592565e-07, "loss": 0.0814, "num_tokens": 15488833.0, "reward": 0.3359375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 574.234375, "completions/mean_terminated_length": 377.14605712890625, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5866666666666667, "grad_norm": 0.5548433627551859, "kl": 0.07196044921875, "learning_rate": 4.1549005199169887e-07, "loss": 0.0829, "num_tokens": 15581723.0, "reward": 0.34375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 558.2421875, "completions/mean_terminated_length": 338.74713134765625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.5902222222222222, "grad_norm": 0.40272898288870146, "kl": 0.0623779296875, "learning_rate": 4.1440714972710245e-07, "loss": 0.0506, "num_tokens": 15672558.0, "reward": 0.328125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 572.34375, "completions/mean_terminated_length": 381.6444396972656, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.5937777777777777, "grad_norm": 0.5371704225303989, "kl": 0.06085205078125, "learning_rate": 4.1331878452492366e-07, "loss": 0.0023, "num_tokens": 15765258.0, "reward": 0.32421875, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 542.640625, "completions/mean_terminated_length": 346.923095703125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.5973333333333334, "grad_norm": 0.5749234165770156, "kl": 0.0762939453125, "learning_rate": 4.122249925494726e-07, "loss": 0.0786, "num_tokens": 15854136.0, "reward": 0.34765625, "reward_std": 0.06612578779459, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 642.5625, "completions/mean_terminated_length": 381.5789489746094, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6008888888888889, "grad_norm": 0.4092965574947992, "kl": 0.0634765625, "learning_rate": 4.111258101453809e-07, "loss": 0.0382, "num_tokens": 15955892.0, "reward": 0.29296875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 530.9765625, "completions/mean_terminated_length": 330.5164794921875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.6044444444444445, "grad_norm": 0.568384174265898, "kl": 0.070037841796875, "learning_rate": 4.10021273836394e-07, "loss": 0.1011, "num_tokens": 16043201.0, "reward": 0.34765625, "reward_std": 0.08889809250831604, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 559.7109375, "completions/mean_terminated_length": 316.51190185546875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.608, "grad_norm": 0.522380124526648, "kl": 0.0614013671875, "learning_rate": 4.0891142032415717e-07, "loss": 0.0901, "num_tokens": 16134224.0, "reward": 0.3203125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 530.8984375, "completions/mean_terminated_length": 352.54254150390625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6115555555555555, "grad_norm": 0.5375576304120275, "kl": 0.06427001953125, "learning_rate": 4.0779628648699647e-07, "loss": 0.1047, "num_tokens": 16221535.0, "reward": 0.3671875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 617.1328125, "completions/mean_terminated_length": 404.01190185546875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.6151111111111112, "grad_norm": 0.5212263058976341, "kl": 0.06268310546875, "learning_rate": 4.066759093786931e-07, "loss": 0.0673, "num_tokens": 16319896.0, "reward": 0.3046875, "reward_std": 0.09858438372612, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 594.2109375, "completions/mean_terminated_length": 384.31396484375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.6186666666666667, "grad_norm": 0.4207304276524183, "kl": 0.061676025390625, "learning_rate": 4.055503262272521e-07, "loss": 0.0446, "num_tokens": 16415407.0, "reward": 0.33203125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 554.1015625, "completions/mean_terminated_length": 370.228271484375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.6222222222222222, "grad_norm": 0.5004076631911177, "kl": 0.067108154296875, "learning_rate": 4.044195744336656e-07, "loss": 0.0503, "num_tokens": 16505688.0, "reward": 0.34765625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 597.21875, "completions/mean_terminated_length": 388.79071044921875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.6257777777777778, "grad_norm": 0.3389085882378621, "kl": 0.06085205078125, "learning_rate": 4.0328369157066975e-07, "loss": 0.0248, "num_tokens": 16601548.0, "reward": 0.32421875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 622.484375, "completions/mean_terminated_length": 356.5454406738281, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.6293333333333333, "grad_norm": 0.47925380724531974, "kl": 0.05706787109375, "learning_rate": 4.021427153814965e-07, "loss": 0.0737, "num_tokens": 16700646.0, "reward": 0.29296875, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 522.6640625, "completions/mean_terminated_length": 400.9805908203125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.6328888888888888, "grad_norm": 0.6430338634877218, "kl": 0.127471923828125, "learning_rate": 4.009966837786194e-07, "loss": 0.0268, "num_tokens": 16786943.0, "reward": 0.3828125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 589.53125, "completions/mean_terminated_length": 406.0888977050781, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6364444444444445, "grad_norm": 0.4913534989424645, "kl": 0.05908203125, "learning_rate": 3.9984563484249355e-07, "loss": 0.0807, "num_tokens": 16881767.0, "reward": 0.34375, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 529.6328125, "completions/mean_terminated_length": 304.92047119140625, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.64, "grad_norm": 0.4201965802081948, "kl": 0.076263427734375, "learning_rate": 3.98689606820291e-07, "loss": 0.0809, "num_tokens": 16968848.0, "reward": 0.3359375, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 634.3671875, "completions/mean_terminated_length": 415.79266357421875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.6435555555555555, "grad_norm": 0.3419085558084264, "kl": 0.067108154296875, "learning_rate": 3.975286381246288e-07, "loss": 0.0384, "num_tokens": 17069451.0, "reward": 0.30859375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 579.640625, "completions/mean_terminated_length": 346.8809509277344, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6471111111111111, "grad_norm": 0.2432958975319321, "kl": 0.060302734375, "learning_rate": 3.963627673322936e-07, "loss": 0.0202, "num_tokens": 17163045.0, "reward": 0.32421875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 610.0078125, "completions/mean_terminated_length": 377.7682800292969, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.6506666666666666, "grad_norm": 0.5281747398974956, "kl": 0.06878662109375, "learning_rate": 3.951920331829592e-07, "loss": 0.1272, "num_tokens": 17260534.0, "reward": 0.3125, "reward_std": 0.09858438372612, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 680.5625, "completions/mean_terminated_length": 377.5294189453125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.6542222222222223, "grad_norm": 0.5092423430173848, "kl": 0.062042236328125, "learning_rate": 3.9401647457789977e-07, "loss": 0.0897, "num_tokens": 17367062.0, "reward": 0.26171875, "reward_std": 0.08416798710823059, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 588.578125, "completions/mean_terminated_length": 368.3058776855469, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.6577777777777778, "grad_norm": 0.5474348788113542, "kl": 0.0654296875, "learning_rate": 3.9283613057869683e-07, "loss": 0.104, "num_tokens": 17461792.0, "reward": 0.32421875, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5546875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 761.2734375, "completions/mean_terminated_length": 434.0175476074219, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.6613333333333333, "grad_norm": 0.3476635615649434, "kl": 0.056121826171875, "learning_rate": 3.9165104040594144e-07, "loss": 0.0497, "num_tokens": 17578703.0, "reward": 0.20703125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.4140625, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 844.0, "completions/mean_length": 632.2421875, "completions/mean_terminated_length": 317.7323913574219, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6648888888888889, "grad_norm": 0.35445397974792686, "kl": 0.060943603515625, "learning_rate": 3.9046124343793104e-07, "loss": 0.0598, "num_tokens": 17679042.0, "reward": 0.26953125, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 616.15625, "completions/mean_terminated_length": 354.71795654296875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6684444444444444, "grad_norm": 0.47776391810863, "kl": 0.058807373046875, "learning_rate": 3.8926677920936093e-07, "loss": 0.0607, "num_tokens": 17777294.0, "reward": 0.296875, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 510.7421875, "completions/mean_terminated_length": 360.3939514160156, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.672, "grad_norm": 0.45444224995700133, "kl": 0.06878662109375, "learning_rate": 3.880676874100106e-07, "loss": 0.0741, "num_tokens": 17862001.0, "reward": 0.37109375, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 557.90625, "completions/mean_terminated_length": 368.3955993652344, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.6755555555555556, "grad_norm": 0.4317758230861406, "kl": 0.06640625, "learning_rate": 3.868640078834251e-07, "loss": 0.0929, "num_tokens": 17952793.0, "reward": 0.3515625, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 487.59375, "completions/mean_terminated_length": 323.38775634765625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.6791111111111111, "grad_norm": 0.39854173141452104, "kl": 0.0635986328125, "learning_rate": 3.856557806255907e-07, "loss": 0.0562, "num_tokens": 18034597.0, "reward": 0.3671875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 577.7421875, "completions/mean_terminated_length": 335.795166015625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.6826666666666666, "grad_norm": 0.3470499680189745, "kl": 0.064666748046875, "learning_rate": 3.844430457836064e-07, "loss": 0.033, "num_tokens": 18127936.0, "reward": 0.3125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 632.359375, "completions/mean_terminated_length": 381.3077087402344, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.6862222222222222, "grad_norm": 0.4499019190207098, "kl": 0.059234619140625, "learning_rate": 3.8322584365434934e-07, "loss": 0.0651, "num_tokens": 18228246.0, "reward": 0.296875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 569.1796875, "completions/mean_terminated_length": 369.87640380859375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.6897777777777778, "grad_norm": 7.162885444263123, "kl": 0.368682861328125, "learning_rate": 3.8200421468313646e-07, "loss": 0.0665, "num_tokens": 18320485.0, "reward": 0.3359375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 511.1796875, "completions/mean_terminated_length": 333.0421142578125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.6933333333333334, "grad_norm": 0.5294809419658657, "kl": 0.0576171875, "learning_rate": 3.807781994623802e-07, "loss": 0.1227, "num_tokens": 18405228.0, "reward": 0.3515625, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 468.09375, "completions/mean_terminated_length": 346.3238220214844, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.6968888888888889, "grad_norm": 0.4971459542513064, "kl": 0.065277099609375, "learning_rate": 3.7954783873023946e-07, "loss": 0.0652, "num_tokens": 18484520.0, "reward": 0.390625, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 575.7890625, "completions/mean_terminated_length": 341.01190185546875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.7004444444444444, "grad_norm": 0.42981739994976315, "kl": 0.062347412109375, "learning_rate": 3.7831317336926674e-07, "loss": 0.0735, "num_tokens": 18577641.0, "reward": 0.32421875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 569.4296875, "completions/mean_terminated_length": 377.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.704, "grad_norm": 0.456067354034936, "kl": 0.05865478515625, "learning_rate": 3.7707424440504863e-07, "loss": 0.059, "num_tokens": 18669964.0, "reward": 0.34765625, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 672.796875, "completions/mean_terminated_length": 381.79998779296875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.7075555555555556, "grad_norm": 0.5325114435911877, "kl": 0.055572509765625, "learning_rate": 3.758310930048436e-07, "loss": 0.0779, "num_tokens": 18775538.0, "reward": 0.265625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.53125, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 570.21875, "completions/mean_terminated_length": 363.9545593261719, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.7111111111111111, "grad_norm": 0.3338558303600543, "kl": 0.0621337890625, "learning_rate": 3.7458376047621356e-07, "loss": 0.0344, "num_tokens": 18867894.0, "reward": 0.328125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 546.484375, "completions/mean_terminated_length": 387.3125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.7146666666666667, "grad_norm": 0.4386753368569343, "kl": 0.059539794921875, "learning_rate": 3.733322882656511e-07, "loss": 0.0275, "num_tokens": 18957208.0, "reward": 0.3515625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 859.0, "completions/mean_length": 618.8046875, "completions/mean_terminated_length": 323.12164306640625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.7182222222222222, "grad_norm": 0.40685525401738737, "kl": 0.064117431640625, "learning_rate": 3.7207671795720296e-07, "loss": 0.0451, "num_tokens": 19055843.0, "reward": 0.28125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 635.4609375, "completions/mean_terminated_length": 386.3974304199219, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.7217777777777777, "grad_norm": 0.32437340068946896, "kl": 0.056396484375, "learning_rate": 3.7081709127108767e-07, "loss": 0.0263, "num_tokens": 19156558.0, "reward": 0.30078125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1013.0, "completions/mean_length": 481.2421875, "completions/mean_terminated_length": 349.5048522949219, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.7253333333333334, "grad_norm": 0.40890277369483813, "kl": 0.0653076171875, "learning_rate": 3.695534500623096e-07, "loss": 0.009, "num_tokens": 19237509.0, "reward": 0.37890625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 591.40625, "completions/mean_terminated_length": 394.7727355957031, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7288888888888889, "grad_norm": 0.33774583966103494, "kl": 0.054351806640625, "learning_rate": 3.68285836319268e-07, "loss": 0.0317, "num_tokens": 19332669.0, "reward": 0.32421875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 514.4140625, "completions/mean_terminated_length": 358.4183654785156, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.7324444444444445, "grad_norm": 0.5408597970318645, "kl": 0.05645751953125, "learning_rate": 3.6701429216236204e-07, "loss": 0.0894, "num_tokens": 19417906.0, "reward": 0.37109375, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 653.0859375, "completions/mean_terminated_length": 390.97332763671875, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.736, "grad_norm": 0.47720779580289074, "kl": 0.05377197265625, "learning_rate": 3.657388598425908e-07, "loss": 0.0591, "num_tokens": 19520921.0, "reward": 0.28515625, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 516.9375, "completions/mean_terminated_length": 326.1075134277344, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.7395555555555555, "grad_norm": 0.36168513410880665, "kl": 0.061065673828125, "learning_rate": 3.644595817401501e-07, "loss": 0.0634, "num_tokens": 19606553.0, "reward": 0.359375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 610.40625, "completions/mean_terminated_length": 378.3902282714844, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.7431111111111111, "grad_norm": 0.5380028855142595, "kl": 0.054931640625, "learning_rate": 3.631765003630233e-07, "loss": 0.0958, "num_tokens": 19704121.0, "reward": 0.296875, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 646.40625, "completions/mean_terminated_length": 396.3116760253906, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.7466666666666667, "grad_norm": 0.4183122156986094, "kl": 0.054443359375, "learning_rate": 3.6188965834556964e-07, "loss": 0.0606, "num_tokens": 19806261.0, "reward": 0.28515625, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 527.28125, "completions/mean_terminated_length": 332.9130554199219, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7502222222222222, "grad_norm": 0.5535571949568251, "kl": 0.057220458984375, "learning_rate": 3.605990984471073e-07, "loss": 0.0605, "num_tokens": 19893037.0, "reward": 0.33203125, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 572.6171875, "completions/mean_terminated_length": 336.1785888671875, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.7537777777777778, "grad_norm": 0.5532026771970818, "kl": 0.05615234375, "learning_rate": 3.5930486355049254e-07, "loss": 0.1242, "num_tokens": 19985716.0, "reward": 0.3203125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 664.28125, "completions/mean_terminated_length": 401.7837829589844, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.7573333333333333, "grad_norm": 0.4307476904673574, "kl": 0.048736572265625, "learning_rate": 3.580069966606949e-07, "loss": 0.0757, "num_tokens": 20090116.0, "reward": 0.2890625, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 513.3203125, "completions/mean_terminated_length": 356.9897766113281, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.7608888888888888, "grad_norm": 0.4234573989995947, "kl": 0.052581787109375, "learning_rate": 3.5670554090336804e-07, "loss": 0.0521, "num_tokens": 20175261.0, "reward": 0.37890625, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 740.0, "completions/mean_length": 545.6796875, "completions/mean_terminated_length": 312.0813903808594, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.7644444444444445, "grad_norm": 0.3234877858824125, "kl": 0.050201416015625, "learning_rate": 3.55400539523417e-07, "loss": 0.0149, "num_tokens": 20264476.0, "reward": 0.328125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 627.234375, "completions/mean_terminated_length": 381.1392517089844, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.768, "grad_norm": 0.5137658406847142, "kl": 0.04913330078125, "learning_rate": 3.5409203588356096e-07, "loss": 0.1093, "num_tokens": 20364206.0, "reward": 0.30078125, "reward_std": 0.0973757952451706, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 542.7109375, "completions/mean_terminated_length": 361.5806579589844, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.7715555555555556, "grad_norm": 0.37475836395367995, "kl": 0.054840087890625, "learning_rate": 3.527800734628927e-07, "loss": 0.0666, "num_tokens": 20453017.0, "reward": 0.34375, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 544.453125, "completions/mean_terminated_length": 326.477294921875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.7751111111111111, "grad_norm": 0.5210418686475017, "kl": 0.054595947265625, "learning_rate": 3.5146469585543386e-07, "loss": 0.0881, "num_tokens": 20542023.0, "reward": 0.33203125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 586.5546875, "completions/mean_terminated_length": 380.40228271484375, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.7786666666666666, "grad_norm": 0.384918284625868, "kl": 0.056793212890625, "learning_rate": 3.501459467686859e-07, "loss": 0.0451, "num_tokens": 20636502.0, "reward": 0.3359375, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 497.1875, "completions/mean_terminated_length": 356.3564147949219, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.7822222222222223, "grad_norm": 0.33762362841517346, "kl": 0.04901123046875, "learning_rate": 3.4882387002217837e-07, "loss": 0.0219, "num_tokens": 20719442.0, "reward": 0.3828125, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 633.0546875, "completions/mean_terminated_length": 365.5657958984375, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.7857777777777778, "grad_norm": 0.45356214921777943, "kl": 0.04791259765625, "learning_rate": 3.474985095460127e-07, "loss": 0.0828, "num_tokens": 20819957.0, "reward": 0.28515625, "reward_std": 0.05952189117670059, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 562.9453125, "completions/mean_terminated_length": 337.7790832519531, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.7893333333333333, "grad_norm": 0.5621142052168658, "kl": 0.050506591796875, "learning_rate": 3.4616990937940207e-07, "loss": 0.0988, "num_tokens": 20911450.0, "reward": 0.30859375, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 524.5234375, "completions/mean_terminated_length": 336.54840087890625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.7928888888888889, "grad_norm": 0.39732009217015, "kl": 0.049652099609375, "learning_rate": 3.448381136692089e-07, "loss": 0.0495, "num_tokens": 20997925.0, "reward": 0.34765625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 558.703125, "completions/mean_terminated_length": 390.40423583984375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.7964444444444444, "grad_norm": 0.4615201081505346, "kl": 0.051849365234375, "learning_rate": 3.435031666684771e-07, "loss": 0.0795, "num_tokens": 21088779.0, "reward": 0.35546875, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 661.3125, "completions/mean_terminated_length": 370.1408386230469, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8, "grad_norm": 0.3794289377306595, "kl": 0.044342041015625, "learning_rate": 3.421651127349622e-07, "loss": 0.0311, "num_tokens": 21192875.0, "reward": 0.26953125, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 516.953125, "completions/mean_terminated_length": 354.9071960449219, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.8035555555555556, "grad_norm": 0.5213831705598811, "kl": 0.051177978515625, "learning_rate": 3.4082399632965696e-07, "loss": 0.0879, "num_tokens": 21278393.0, "reward": 0.37109375, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 551.859375, "completions/mean_terminated_length": 344.96630859375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8071111111111111, "grad_norm": 0.5384745403639236, "kl": 0.04705810546875, "learning_rate": 3.394798620153147e-07, "loss": 0.0955, "num_tokens": 21368395.0, "reward": 0.33203125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 598.703125, "completions/mean_terminated_length": 432.2826232910156, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.8106666666666666, "grad_norm": 0.27168571682674436, "kl": 0.056121826171875, "learning_rate": 3.3813275445496766e-07, "loss": 0.0281, "num_tokens": 21464437.0, "reward": 0.3515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 516.84375, "completions/mean_terminated_length": 368.2828369140625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8142222222222222, "grad_norm": 0.4918864702245254, "kl": 0.048919677734375, "learning_rate": 3.367827184104437e-07, "loss": 0.0748, "num_tokens": 21549981.0, "reward": 0.37890625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 617.1015625, "completions/mean_terminated_length": 403.96429443359375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.8177777777777778, "grad_norm": 0.4281015074032974, "kl": 0.04840087890625, "learning_rate": 3.354297987408784e-07, "loss": 0.0553, "num_tokens": 21648362.0, "reward": 0.33203125, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 659.09375, "completions/mean_terminated_length": 425.17950439453125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8213333333333334, "grad_norm": 0.39276027009478315, "kl": 0.04217529296875, "learning_rate": 3.340740404012251e-07, "loss": 0.0699, "num_tokens": 21752226.0, "reward": 0.2890625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 571.640625, "completions/mean_terminated_length": 326.3855285644531, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.8248888888888889, "grad_norm": 0.24553332449976523, "kl": 0.047576904296875, "learning_rate": 3.3271548844076034e-07, "loss": 0.0276, "num_tokens": 21844776.0, "reward": 0.32421875, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 588.265625, "completions/mean_terminated_length": 360.0238037109375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8284444444444444, "grad_norm": 0.4192334305516584, "kl": 0.046722412109375, "learning_rate": 3.313541880015877e-07, "loss": 0.0758, "num_tokens": 21939514.0, "reward": 0.32421875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 547.546875, "completions/mean_terminated_length": 353.8241882324219, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.832, "grad_norm": 0.4340402629653225, "kl": 0.0489501953125, "learning_rate": 3.299901843171374e-07, "loss": 0.0769, "num_tokens": 22029004.0, "reward": 0.359375, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 578.9140625, "completions/mean_terminated_length": 353.7529602050781, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.8355555555555556, "grad_norm": 0.4550605055318035, "kl": 0.044708251953125, "learning_rate": 3.2862352271066324e-07, "loss": 0.1018, "num_tokens": 22122537.0, "reward": 0.32421875, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 659.421875, "completions/mean_terminated_length": 440.6750183105469, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.8391111111111111, "grad_norm": 0.2445248101106762, "kl": 0.043487548828125, "learning_rate": 3.272542485937368e-07, "loss": 0.0168, "num_tokens": 22226391.0, "reward": 0.296875, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 564.921875, "completions/mean_terminated_length": 363.7528076171875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.8426666666666667, "grad_norm": 0.43451792973280545, "kl": 0.043853759765625, "learning_rate": 3.2588240746473866e-07, "loss": 0.0454, "num_tokens": 22318085.0, "reward": 0.33984375, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 897.0, "completions/mean_length": 522.15625, "completions/mean_terminated_length": 381.6399841308594, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8462222222222222, "grad_norm": 0.46097223321182623, "kl": 0.044464111328125, "learning_rate": 3.245080449073459e-07, "loss": 0.0558, "num_tokens": 22404345.0, "reward": 0.375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 730.25, "completions/mean_terminated_length": 386.71185302734375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.8497777777777777, "grad_norm": 0.2789008683867169, "kl": 0.041046142578125, "learning_rate": 3.231312065890183e-07, "loss": 0.0253, "num_tokens": 22517245.0, "reward": 0.21484375, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.4296875, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 542.9296875, "completions/mean_terminated_length": 361.8817138671875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.8533333333333334, "grad_norm": 0.40992825759253854, "kl": 0.045440673828125, "learning_rate": 3.217519382594801e-07, "loss": 0.0497, "num_tokens": 22606152.0, "reward": 0.35546875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5078125, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 689.9609375, "completions/mean_terminated_length": 345.3174743652344, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8568888888888889, "grad_norm": 0.3509374457556898, "kl": 0.042083740234375, "learning_rate": 3.203702857492005e-07, "loss": 0.0698, "num_tokens": 22713879.0, "reward": 0.23828125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.4765625, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 682.15625, "completions/mean_terminated_length": 380.5294189453125, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.8604444444444445, "grad_norm": 0.43431331969026526, "kl": 0.0428466796875, "learning_rate": 3.189862949678704e-07, "loss": 0.0842, "num_tokens": 22820663.0, "reward": 0.25390625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 632.2890625, "completions/mean_terminated_length": 381.19232177734375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.864, "grad_norm": 0.2645645652285494, "kl": 0.040313720703125, "learning_rate": 3.1760001190287695e-07, "loss": 0.0302, "num_tokens": 22921096.0, "reward": 0.29296875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 603.078125, "completions/mean_terminated_length": 350.5249938964844, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.8675555555555555, "grad_norm": 0.3939912042247512, "kl": 0.04498291015625, "learning_rate": 3.162114826177756e-07, "loss": 0.0723, "num_tokens": 23017690.0, "reward": 0.296875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 565.1640625, "completions/mean_terminated_length": 412.21875, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.8711111111111111, "grad_norm": 0.3795987284244153, "kl": 0.046722412109375, "learning_rate": 3.148207532507595e-07, "loss": 0.0567, "num_tokens": 23109391.0, "reward": 0.35546875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 564.234375, "completions/mean_terminated_length": 362.7640380859375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8746666666666667, "grad_norm": 0.3098450486569573, "kl": 0.046661376953125, "learning_rate": 3.134278700131262e-07, "loss": 0.0532, "num_tokens": 23200985.0, "reward": 0.34375, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 636.2578125, "completions/mean_terminated_length": 387.70513916015625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.8782222222222222, "grad_norm": 0.36160488184511425, "kl": 0.046173095703125, "learning_rate": 3.1203287918774224e-07, "loss": 0.0643, "num_tokens": 23301874.0, "reward": 0.3046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 556.046875, "completions/mean_terminated_length": 365.7802429199219, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.8817777777777778, "grad_norm": 0.3725063522346133, "kl": 0.04913330078125, "learning_rate": 3.106358271275056e-07, "loss": 0.0622, "num_tokens": 23392400.0, "reward": 0.35546875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 678.2734375, "completions/mean_terminated_length": 391.8143005371094, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.8853333333333333, "grad_norm": 0.4365105008338742, "kl": 0.041229248046875, "learning_rate": 3.0923676025380483e-07, "loss": 0.0738, "num_tokens": 23498723.0, "reward": 0.25390625, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 607.28125, "completions/mean_terminated_length": 431.3333435058594, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.8888888888888888, "grad_norm": 0.5131341708057733, "kl": 0.045074462890625, "learning_rate": 3.078357250549772e-07, "loss": 0.0714, "num_tokens": 23595975.0, "reward": 0.35546875, "reward_std": 0.09077188372612, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 554.6875, "completions/mean_terminated_length": 333.5172424316406, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.8924444444444445, "grad_norm": 0.3066207203806433, "kl": 0.046295166015625, "learning_rate": 3.064327680847635e-07, "loss": 0.0333, "num_tokens": 23686307.0, "reward": 0.33984375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 541.0, "completions/mean_terminated_length": 337.0666809082031, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.896, "grad_norm": 0.1955617110849147, "kl": 0.04547119140625, "learning_rate": 3.0502793596076136e-07, "loss": 0.0149, "num_tokens": 23774891.0, "reward": 0.34765625, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 570.5078125, "completions/mean_terminated_length": 386.1208801269531, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.8995555555555556, "grad_norm": 0.4060044739473163, "kl": 0.047027587890625, "learning_rate": 3.0362127536287636e-07, "loss": 0.0357, "num_tokens": 23867356.0, "reward": 0.34765625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 886.0, "completions/mean_length": 605.8984375, "completions/mean_terminated_length": 328.9740295410156, "completions/min_length": 29.0, "completions/min_terminated_length": 29.0, "epoch": 0.9031111111111111, "grad_norm": 0.4031612054383529, "kl": 0.045928955078125, "learning_rate": 3.022128330317705e-07, "loss": 0.0415, "num_tokens": 23964299.0, "reward": 0.27734375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 666.28125, "completions/mean_terminated_length": 350.6470642089844, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.9066666666666666, "grad_norm": 0.28389952782427763, "kl": 0.04534912109375, "learning_rate": 3.0080265576730977e-07, "loss": 0.0517, "num_tokens": 24069051.0, "reward": 0.265625, "reward_std": 0.04268829524517059, "rewards/equation_reward_func/mean": 0.53125, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 652.28125, "completions/mean_terminated_length": 381.02703857421875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.9102222222222223, "grad_norm": 0.30245210685630924, "kl": 0.044219970703125, "learning_rate": 2.993907904270084e-07, "loss": 0.0472, "num_tokens": 24171971.0, "reward": 0.27734375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 633.6484375, "completions/mean_terminated_length": 357.8000183105469, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9137777777777778, "grad_norm": 0.42516385348562963, "kl": 0.0404052734375, "learning_rate": 2.979772839244723e-07, "loss": 0.0724, "num_tokens": 24272542.0, "reward": 0.28515625, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 558.9453125, "completions/mean_terminated_length": 355.1573181152344, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.9173333333333333, "grad_norm": 0.49497445593786343, "kl": 0.046112060546875, "learning_rate": 2.965621832278401e-07, "loss": 0.0863, "num_tokens": 24363507.0, "reward": 0.33984375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.125, "completions/max_length": 1024.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 392.078125, "completions/mean_terminated_length": 301.8035888671875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.9208888888888889, "grad_norm": 0.2700428014220197, "kl": 0.05169677734375, "learning_rate": 2.951455353582224e-07, "loss": 0.0321, "num_tokens": 24432977.0, "reward": 0.43359375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.8671875, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 546.34375, "completions/mean_terminated_length": 393.6907043457031, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.9244444444444444, "grad_norm": 0.6093825303034028, "kl": 0.0474853515625, "learning_rate": 2.937273873881396e-07, "loss": 0.0644, "num_tokens": 24522225.0, "reward": 0.3671875, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 528.453125, "completions/mean_terminated_length": 334.5434875488281, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.928, "grad_norm": 0.3626342261168496, "kl": 0.0506591796875, "learning_rate": 2.9230778643995724e-07, "loss": 0.0541, "num_tokens": 24609283.0, "reward": 0.3359375, "reward_std": 0.04268829524517059, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 598.5234375, "completions/mean_terminated_length": 398.0115051269531, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.9315555555555556, "grad_norm": 0.5508627532091885, "kl": 0.048248291015625, "learning_rate": 2.90886779684321e-07, "loss": 0.0783, "num_tokens": 24705262.0, "reward": 0.32421875, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 679.546875, "completions/mean_terminated_length": 403.0140686035156, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.9351111111111111, "grad_norm": 0.5189251679804696, "kl": 0.04156494140625, "learning_rate": 2.894644143385885e-07, "loss": 0.0875, "num_tokens": 24811724.0, "reward": 0.2734375, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 523.3984375, "completions/mean_terminated_length": 342.32977294921875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.9386666666666666, "grad_norm": 0.290296980161375, "kl": 0.044830322265625, "learning_rate": 2.8804073766526095e-07, "loss": 0.042, "num_tokens": 24898107.0, "reward": 0.3515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 589.546875, "completions/mean_terminated_length": 337.456787109375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.9422222222222222, "grad_norm": 0.4755156136561332, "kl": 0.04620361328125, "learning_rate": 2.866157969704125e-07, "loss": 0.0858, "num_tokens": 24992961.0, "reward": 0.3125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 494.71875, "completions/mean_terminated_length": 366.25244140625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9457777777777778, "grad_norm": 0.47791478683979227, "kl": 0.043212890625, "learning_rate": 2.851896396021181e-07, "loss": 0.0431, "num_tokens": 25075649.0, "reward": 0.39453125, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 557.2265625, "completions/mean_terminated_length": 329.2674255371094, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.9493333333333334, "grad_norm": 0.37948128665963876, "kl": 0.046234130859375, "learning_rate": 2.837623129488808e-07, "loss": 0.0591, "num_tokens": 25166282.0, "reward": 0.328125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 519.671875, "completions/mean_terminated_length": 337.25531005859375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.9528888888888889, "grad_norm": 0.49689743036636796, "kl": 0.049285888671875, "learning_rate": 2.823338644380566e-07, "loss": 0.0988, "num_tokens": 25252204.0, "reward": 0.3671875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 573.2265625, "completions/mean_terminated_length": 328.831298828125, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.9564444444444444, "grad_norm": 0.45892332878810566, "kl": 0.04608154296875, "learning_rate": 2.809043415342784e-07, "loss": 0.0615, "num_tokens": 25344925.0, "reward": 0.328125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 615.921875, "completions/mean_terminated_length": 371.07501220703125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.96, "grad_norm": 0.3741655643569301, "kl": 0.046173095703125, "learning_rate": 2.794737917378797e-07, "loss": 0.0702, "num_tokens": 25443159.0, "reward": 0.30078125, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 580.6328125, "completions/mean_terminated_length": 340.25299072265625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.9635555555555556, "grad_norm": 0.3464055383797514, "kl": 0.04449462890625, "learning_rate": 2.780422625833153e-07, "loss": 0.0637, "num_tokens": 25536840.0, "reward": 0.3203125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 557.4921875, "completions/mean_terminated_length": 360.522216796875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.9671111111111111, "grad_norm": 0.5207686458939994, "kl": 0.046417236328125, "learning_rate": 2.766098016375823e-07, "loss": 0.0684, "num_tokens": 25627571.0, "reward": 0.33203125, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 555.265625, "completions/mean_terminated_length": 326.3488464355469, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.9706666666666667, "grad_norm": 0.3709023295132443, "kl": 0.04400634765625, "learning_rate": 2.751764564986396e-07, "loss": 0.0559, "num_tokens": 25718013.0, "reward": 0.33203125, "reward_std": 0.04147969186306, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 539.6171875, "completions/mean_terminated_length": 357.32257080078125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.9742222222222222, "grad_norm": 0.32275455170934847, "kl": 0.048248291015625, "learning_rate": 2.737422747938259e-07, "loss": 0.0215, "num_tokens": 25806492.0, "reward": 0.3515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4609375, "completions/max_length": 1024.0, "completions/max_terminated_length": 875.0, "completions/mean_length": 668.8515625, "completions/mean_terminated_length": 365.1739196777344, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.9777777777777777, "grad_norm": 0.3929601201245574, "kl": 0.0447998046875, "learning_rate": 2.723073041782776e-07, "loss": 0.0675, "num_tokens": 25911577.0, "reward": 0.25390625, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 487.140625, "completions/mean_terminated_length": 343.6237487792969, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.9813333333333333, "grad_norm": 0.40814793544358813, "kl": 0.049652099609375, "learning_rate": 2.708715923333451e-07, "loss": 0.0601, "num_tokens": 25993207.0, "reward": 0.37890625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 557.0390625, "completions/mean_terminated_length": 320.811767578125, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.9848888888888889, "grad_norm": 0.4111161310146508, "kl": 0.045013427734375, "learning_rate": 2.6943518696500835e-07, "loss": 0.0355, "num_tokens": 26083880.0, "reward": 0.32421875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 528.296875, "completions/mean_terminated_length": 349.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.9884444444444445, "grad_norm": 0.4967029882704871, "kl": 0.0491943359375, "learning_rate": 2.6799813580229174e-07, "loss": 0.0895, "num_tokens": 26170854.0, "reward": 0.3671875, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 539.6640625, "completions/mean_terminated_length": 350.14129638671875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.992, "grad_norm": 0.5353298538494163, "kl": 0.04522705078125, "learning_rate": 2.6656048659567834e-07, "loss": 0.0857, "num_tokens": 26259303.0, "reward": 0.3515625, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 573.546875, "completions/mean_terminated_length": 368.79547119140625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.9955555555555555, "grad_norm": 0.5244849742334187, "kl": 0.043609619140625, "learning_rate": 2.65122287115523e-07, "loss": 0.124, "num_tokens": 26352129.0, "reward": 0.3359375, "reward_std": 0.10232105851173401, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 581.0499877929688, "completions/mean_terminated_length": 315.2799987792969, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.9991111111111111, "grad_norm": 0.45761875244517275, "kl": 0.0584716796875, "learning_rate": 2.63683585150465e-07, "loss": 0.0424, "num_tokens": 26449905.0, "reward": 0.29296875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 570.0546875, "completions/mean_terminated_length": 363.7159118652344, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.0035555555555555, "grad_norm": 0.4093321100955424, "kl": 0.0482177734375, "learning_rate": 2.622444285058404e-07, "loss": 0.0711, "num_tokens": 26542300.0, "reward": 0.33203125, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 636.265625, "completions/mean_terminated_length": 418.7560729980469, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.007111111111111, "grad_norm": 0.4464206707169397, "kl": 0.04754638671875, "learning_rate": 2.6080486500209347e-07, "loss": 0.052, "num_tokens": 26643110.0, "reward": 0.29296875, "reward_std": 0.09737578779459, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 489.8359375, "completions/mean_terminated_length": 319.1236877441406, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 1.0106666666666666, "grad_norm": 0.4051251644840858, "kl": 0.050445556640625, "learning_rate": 2.5936494247318733e-07, "loss": 0.0483, "num_tokens": 26725129.0, "reward": 0.375, "reward_std": 0.04400775954127312, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 671.859375, "completions/mean_terminated_length": 380.0857238769531, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.0142222222222221, "grad_norm": 0.2915789701664038, "kl": 0.045318603515625, "learning_rate": 2.5792470876501517e-07, "loss": 0.0438, "num_tokens": 26830591.0, "reward": 0.26953125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 590.3515625, "completions/mean_terminated_length": 385.9884948730469, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 1.0177777777777777, "grad_norm": 0.5892244535918393, "kl": 0.04400634765625, "learning_rate": 2.5648421173380974e-07, "loss": 0.07, "num_tokens": 26925532.0, "reward": 0.3125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 613.09375, "completions/mean_terminated_length": 349.69232177734375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.0213333333333334, "grad_norm": 0.5188484080967409, "kl": 0.04583740234375, "learning_rate": 2.550434992445538e-07, "loss": 0.1134, "num_tokens": 27023444.0, "reward": 0.29296875, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 515.8046875, "completions/mean_terminated_length": 353.3917541503906, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.024888888888889, "grad_norm": 0.4453687940239156, "kl": 0.04742431640625, "learning_rate": 2.536026191693893e-07, "loss": 0.0717, "num_tokens": 27108851.0, "reward": 0.3671875, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 676.203125, "completions/mean_terminated_length": 369.3235168457031, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.0284444444444445, "grad_norm": 0.28667893508980163, "kl": 0.044830322265625, "learning_rate": 2.521616193860266e-07, "loss": 0.0293, "num_tokens": 27214877.0, "reward": 0.25390625, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.5078125, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 595.765625, "completions/mean_terminated_length": 371.452392578125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 1.032, "grad_norm": 0.3629373420467616, "kl": 0.044952392578125, "learning_rate": 2.507205477761539e-07, "loss": 0.0736, "num_tokens": 27310543.0, "reward": 0.32421875, "reward_std": 0.05952189117670059, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 491.3671875, "completions/mean_terminated_length": 313.82293701171875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 1.0355555555555556, "grad_norm": 0.28127151582356813, "kl": 0.04888916015625, "learning_rate": 2.4927945222384613e-07, "loss": 0.043, "num_tokens": 27392862.0, "reward": 0.36328125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 514.328125, "completions/mean_terminated_length": 322.5161437988281, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 1.039111111111111, "grad_norm": 0.532248491417988, "kl": 0.049530029296875, "learning_rate": 2.4783838061397334e-07, "loss": 0.1193, "num_tokens": 27478048.0, "reward": 0.36328125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 543.2890625, "completions/mean_terminated_length": 332.64044189453125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.0426666666666666, "grad_norm": 0.4672207713046188, "kl": 0.05322265625, "learning_rate": 2.4639738083061073e-07, "loss": 0.1004, "num_tokens": 27566909.0, "reward": 0.34765625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 628.0, "completions/mean_terminated_length": 374.1538391113281, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.0462222222222222, "grad_norm": 0.4817826517835868, "kl": 0.04803466796875, "learning_rate": 2.4495650075544613e-07, "loss": 0.0936, "num_tokens": 27666733.0, "reward": 0.296875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 618.3203125, "completions/mean_terminated_length": 340.75, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.0497777777777777, "grad_norm": 0.21389402727274193, "kl": 0.049835205078125, "learning_rate": 2.435157882661903e-07, "loss": 0.0345, "num_tokens": 27765334.0, "reward": 0.29296875, "reward_std": 0.025854695588350296, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 557.0625, "completions/mean_terminated_length": 374.34783935546875, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.0533333333333332, "grad_norm": 0.4372388295180305, "kl": 0.052398681640625, "learning_rate": 2.420752912349848e-07, "loss": 0.0275, "num_tokens": 27856066.0, "reward": 0.35546875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 555.0625, "completions/mean_terminated_length": 405.19586181640625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 1.056888888888889, "grad_norm": 0.6307617605623742, "kl": 0.068359375, "learning_rate": 2.4063505752681265e-07, "loss": 0.0643, "num_tokens": 27946542.0, "reward": 0.3671875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 623.46875, "completions/mean_terminated_length": 406.313232421875, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.0604444444444445, "grad_norm": 0.3761749271379481, "kl": 0.046630859375, "learning_rate": 2.3919513499790646e-07, "loss": 0.0265, "num_tokens": 28045774.0, "reward": 0.3046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 610.25, "completions/mean_terminated_length": 408.18603515625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.064, "grad_norm": 0.5661687442556961, "kl": 0.0487060546875, "learning_rate": 2.3775557149415953e-07, "loss": 0.0941, "num_tokens": 28143366.0, "reward": 0.30859375, "reward_std": 0.09979298710823059, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 538.0078125, "completions/mean_terminated_length": 362.223388671875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 1.0675555555555556, "grad_norm": 0.5156628665994852, "kl": 0.0518798828125, "learning_rate": 2.3631641484953493e-07, "loss": 0.065, "num_tokens": 28231555.0, "reward": 0.36328125, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 529.0234375, "completions/mean_terminated_length": 364.03125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.0711111111111111, "grad_norm": 0.4166986193229658, "kl": 0.051849365234375, "learning_rate": 2.3487771288447703e-07, "loss": 0.0583, "num_tokens": 28318662.0, "reward": 0.375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 596.421875, "completions/mean_terminated_length": 331.2152099609375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.0746666666666667, "grad_norm": 0.3751339423918947, "kl": 0.0516357421875, "learning_rate": 2.3343951340432158e-07, "loss": 0.0725, "num_tokens": 28414424.0, "reward": 0.3046875, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 543.8125, "completions/mean_terminated_length": 390.3504943847656, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.0782222222222222, "grad_norm": 0.43551662202895486, "kl": 0.050445556640625, "learning_rate": 2.3200186419770823e-07, "loss": 0.0751, "num_tokens": 28503456.0, "reward": 0.3671875, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 598.0, "completions/mean_terminated_length": 333.77215576171875, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.0817777777777777, "grad_norm": 0.4161624163853555, "kl": 0.051910400390625, "learning_rate": 2.3056481303499163e-07, "loss": 0.0564, "num_tokens": 28599324.0, "reward": 0.30859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 496.40625, "completions/mean_terminated_length": 368.3495178222656, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 1.0853333333333333, "grad_norm": 0.41083839068544, "kl": 0.053131103515625, "learning_rate": 2.291284076666549e-07, "loss": 0.0369, "num_tokens": 28682220.0, "reward": 0.39453125, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 571.8359375, "completions/mean_terminated_length": 334.98809814453125, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 1.0888888888888888, "grad_norm": 0.44542347610615185, "kl": 0.05242919921875, "learning_rate": 2.2769269582172236e-07, "loss": 0.0308, "num_tokens": 28774787.0, "reward": 0.31640625, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 599.140625, "completions/mean_terminated_length": 326.79486083984375, "completions/min_length": 26.0, "completions/min_terminated_length": 26.0, "epoch": 1.0924444444444443, "grad_norm": 0.5145969287893023, "kl": 0.051727294921875, "learning_rate": 2.262577252061741e-07, "loss": 0.041, "num_tokens": 28870937.0, "reward": 0.296875, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 522.359375, "completions/mean_terminated_length": 310.5555725097656, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.096, "grad_norm": 0.48529408271351, "kl": 0.057861328125, "learning_rate": 2.2482354350136043e-07, "loss": 0.0704, "num_tokens": 28957175.0, "reward": 0.3359375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 589.5546875, "completions/mean_terminated_length": 345.8414611816406, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 1.0995555555555556, "grad_norm": 0.4000513186583522, "kl": 0.0537109375, "learning_rate": 2.2339019836241768e-07, "loss": 0.0245, "num_tokens": 29052042.0, "reward": 0.3125, "reward_std": 0.04268829524517059, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 935.0, "completions/mean_length": 622.078125, "completions/mean_terminated_length": 364.4359130859375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.1031111111111112, "grad_norm": 0.2797678064378038, "kl": 0.051025390625, "learning_rate": 2.219577374166847e-07, "loss": 0.0552, "num_tokens": 29151100.0, "reward": 0.296875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 640.6953125, "completions/mean_terminated_length": 402.9493713378906, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 1.1066666666666667, "grad_norm": 0.33279120678858987, "kl": 0.053070068359375, "learning_rate": 2.2052620826212031e-07, "loss": 0.039, "num_tokens": 29252561.0, "reward": 0.30078125, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 564.484375, "completions/mean_terminated_length": 347.9310302734375, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.1102222222222222, "grad_norm": 0.46225925011459174, "kl": 0.05450439453125, "learning_rate": 2.1909565846572158e-07, "loss": 0.0648, "num_tokens": 29344187.0, "reward": 0.32421875, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 581.609375, "completions/mean_terminated_length": 333.43902587890625, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 1.1137777777777778, "grad_norm": 0.4986793456455036, "kl": 0.053985595703125, "learning_rate": 2.1766613556194344e-07, "loss": 0.0651, "num_tokens": 29438033.0, "reward": 0.3203125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 487.4140625, "completions/mean_terminated_length": 337.16998291015625, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.1173333333333333, "grad_norm": 0.5300249795760147, "kl": 0.057647705078125, "learning_rate": 2.1623768705111914e-07, "loss": 0.0648, "num_tokens": 29519774.0, "reward": 0.38671875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 549.0625, "completions/mean_terminated_length": 355.9560546875, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 1.1208888888888888, "grad_norm": 0.27574687289754485, "kl": 0.054473876953125, "learning_rate": 2.1481036039788185e-07, "loss": 0.0461, "num_tokens": 29609510.0, "reward": 0.34765625, "reward_std": 0.04389689117670059, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 660.4296875, "completions/mean_terminated_length": 395.12164306640625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.1244444444444444, "grad_norm": 0.40740853537128535, "kl": 0.050140380859375, "learning_rate": 2.133842030295875e-07, "loss": 0.0473, "num_tokens": 29713417.0, "reward": 0.2734375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 501.9375, "completions/mean_terminated_length": 355.7599792480469, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.1280000000000001, "grad_norm": 0.37419094794305535, "kl": 0.05401611328125, "learning_rate": 2.1195926233473905e-07, "loss": 0.0458, "num_tokens": 29797041.0, "reward": 0.3828125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 510.2578125, "completions/mean_terminated_length": 352.9897766113281, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.1315555555555556, "grad_norm": 0.4408493083502953, "kl": 0.053558349609375, "learning_rate": 2.105355856614115e-07, "loss": 0.068, "num_tokens": 29881738.0, "reward": 0.37109375, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 575.90625, "completions/mean_terminated_length": 379.5505676269531, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.1351111111111112, "grad_norm": 0.4049821981067842, "kl": 0.05596923828125, "learning_rate": 2.0911322031567907e-07, "loss": 0.062, "num_tokens": 29974926.0, "reward": 0.34375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 573.84375, "completions/mean_terminated_length": 346.1176452636719, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 1.1386666666666667, "grad_norm": 0.3495660524520386, "kl": 0.049224853515625, "learning_rate": 2.076922135600427e-07, "loss": 0.0516, "num_tokens": 30067766.0, "reward": 0.328125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 628.8359375, "completions/mean_terminated_length": 375.525634765625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.1422222222222222, "grad_norm": 0.3298483449172045, "kl": 0.04656982421875, "learning_rate": 2.0627261261186048e-07, "loss": 0.0432, "num_tokens": 30167661.0, "reward": 0.30078125, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 603.9296875, "completions/mean_terminated_length": 360.1851806640625, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 1.1457777777777778, "grad_norm": 0.2643268050519706, "kl": 0.054046630859375, "learning_rate": 2.0485446464177752e-07, "loss": 0.0229, "num_tokens": 30264412.0, "reward": 0.30859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 899.0, "completions/mean_length": 635.8203125, "completions/mean_terminated_length": 370.22369384765625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.1493333333333333, "grad_norm": 0.6001863182857846, "kl": 0.0460205078125, "learning_rate": 2.034378167721599e-07, "loss": 0.0734, "num_tokens": 30365229.0, "reward": 0.2890625, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 449.7109375, "completions/mean_terminated_length": 330.51885986328125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.1528888888888889, "grad_norm": 0.5452184666522992, "kl": 0.053009033203125, "learning_rate": 2.0202271607552766e-07, "loss": 0.0691, "num_tokens": 30442044.0, "reward": 0.40625, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.8046875, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 955.0, "completions/mean_length": 654.5390625, "completions/mean_terminated_length": 401.75, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 1.1564444444444444, "grad_norm": 0.40908258142194187, "kl": 0.04620361328125, "learning_rate": 2.006092095729916e-07, "loss": 0.0597, "num_tokens": 30545253.0, "reward": 0.28515625, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 521.34375, "completions/mean_terminated_length": 360.7010192871094, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.16, "grad_norm": 0.42482999616755596, "kl": 0.051055908203125, "learning_rate": 1.9919734423269018e-07, "loss": 0.0826, "num_tokens": 30631365.0, "reward": 0.3828125, "reward_std": 0.09703661501407623, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 610.9140625, "completions/mean_terminated_length": 409.1744079589844, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 1.1635555555555555, "grad_norm": 0.381795500545772, "kl": 0.04583740234375, "learning_rate": 1.9778716696822948e-07, "loss": 0.0442, "num_tokens": 30728994.0, "reward": 0.328125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 594.4765625, "completions/mean_terminated_length": 336.76251220703125, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.1671111111111112, "grad_norm": 0.3228601321652542, "kl": 0.046783447265625, "learning_rate": 1.9637872463712362e-07, "loss": 0.0656, "num_tokens": 30824483.0, "reward": 0.30859375, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 518.4921875, "completions/mean_terminated_length": 335.64892578125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 1.1706666666666667, "grad_norm": 0.49699323161032616, "kl": 0.0518798828125, "learning_rate": 1.9497206403923864e-07, "loss": 0.0779, "num_tokens": 30910222.0, "reward": 0.359375, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 529.484375, "completions/mean_terminated_length": 364.6458435058594, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.1742222222222223, "grad_norm": 0.6227806564961746, "kl": 0.050323486328125, "learning_rate": 1.9356723191523646e-07, "loss": 0.1297, "num_tokens": 30997400.0, "reward": 0.375, "reward_std": 0.07767495512962341, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 554.21875, "completions/mean_terminated_length": 377.4193420410156, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 1.1777777777777778, "grad_norm": 0.3320732323601273, "kl": 0.052886962890625, "learning_rate": 1.921642749450228e-07, "loss": 0.0272, "num_tokens": 31087736.0, "reward": 0.3515625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 579.296875, "completions/mean_terminated_length": 346.3571472167969, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.1813333333333333, "grad_norm": 0.45384220295566097, "kl": 0.0469970703125, "learning_rate": 1.9076323974619512e-07, "loss": 0.1037, "num_tokens": 31181242.0, "reward": 0.3125, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 548.2109375, "completions/mean_terminated_length": 389.6145935058594, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.1848888888888889, "grad_norm": 0.5585368307899082, "kl": 0.04888916015625, "learning_rate": 1.8936417287249446e-07, "loss": 0.0908, "num_tokens": 31270789.0, "reward": 0.37109375, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 553.03125, "completions/mean_terminated_length": 306.3333435058594, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 1.1884444444444444, "grad_norm": 0.1644178043279491, "kl": 0.050506591796875, "learning_rate": 1.8796712081225774e-07, "loss": 0.0187, "num_tokens": 31360933.0, "reward": 0.3203125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 603.8203125, "completions/mean_terminated_length": 368.1097412109375, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 1.192, "grad_norm": 0.521691975239134, "kl": 0.047454833984375, "learning_rate": 1.8657212998687388e-07, "loss": 0.0411, "num_tokens": 31457570.0, "reward": 0.3203125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 548.4609375, "completions/mean_terminated_length": 383.2737121582031, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.1955555555555555, "grad_norm": 0.18900147660632116, "kl": 0.0516357421875, "learning_rate": 1.8517924674924046e-07, "loss": 0.0069, "num_tokens": 31547157.0, "reward": 0.36328125, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 593.015625, "completions/mean_terminated_length": 389.9080505371094, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.199111111111111, "grad_norm": 0.4676749196615072, "kl": 0.0489501953125, "learning_rate": 1.8378851738222439e-07, "loss": 0.0814, "num_tokens": 31642499.0, "reward": 0.33203125, "reward_std": 0.09979298710823059, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 603.9765625, "completions/mean_terminated_length": 307.1600036621094, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.2026666666666666, "grad_norm": 0.3946019835120493, "kl": 0.04449462890625, "learning_rate": 1.82399988097123e-07, "loss": 0.0519, "num_tokens": 31739232.0, "reward": 0.28515625, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 612.9453125, "completions/mean_terminated_length": 382.3536376953125, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 1.2062222222222223, "grad_norm": 0.41137008498148375, "kl": 0.049102783203125, "learning_rate": 1.8101370503212962e-07, "loss": 0.0573, "num_tokens": 31837177.0, "reward": 0.3046875, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.21875, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 502.0703125, "completions/mean_terminated_length": 355.92999267578125, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.2097777777777778, "grad_norm": 0.28921010077797954, "kl": 0.051605224609375, "learning_rate": 1.7962971425079946e-07, "loss": 0.046, "num_tokens": 31920806.0, "reward": 0.390625, "reward_std": 0.04268829524517059, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 608.4609375, "completions/mean_terminated_length": 383.1686706542969, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.2133333333333334, "grad_norm": 0.37579255808216155, "kl": 0.04388427734375, "learning_rate": 1.7824806174051994e-07, "loss": 0.0521, "num_tokens": 32018101.0, "reward": 0.3125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 849.0, "completions/mean_length": 595.8984375, "completions/mean_terminated_length": 330.3670959472656, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.216888888888889, "grad_norm": 0.4003843458313343, "kl": 0.048583984375, "learning_rate": 1.7686879341098172e-07, "loss": 0.0674, "num_tokens": 32113808.0, "reward": 0.30078125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 562.34375, "completions/mean_terminated_length": 344.7816162109375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 1.2204444444444444, "grad_norm": 0.38365461058372496, "kl": 0.050048828125, "learning_rate": 1.7549195509265407e-07, "loss": 0.0611, "num_tokens": 32205176.0, "reward": 0.328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 658.890625, "completions/mean_terminated_length": 293.78125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.224, "grad_norm": 0.31463814496323855, "kl": 0.04412841796875, "learning_rate": 1.7411759253526137e-07, "loss": 0.0356, "num_tokens": 32308902.0, "reward": 0.25, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.4921875, "rewards/equation_reward_func/std": 0.5019033551216125, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 666.9921875, "completions/mean_terminated_length": 351.98529052734375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 1.2275555555555555, "grad_norm": 0.4192245591896077, "kl": 0.043731689453125, "learning_rate": 1.7274575140626315e-07, "loss": 0.0242, "num_tokens": 32413729.0, "reward": 0.26171875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 547.5703125, "completions/mean_terminated_length": 353.8571472167969, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.231111111111111, "grad_norm": 0.4842711133919289, "kl": 0.0614013671875, "learning_rate": 1.713764772893368e-07, "loss": 0.0823, "num_tokens": 32503218.0, "reward": 0.34765625, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 793.0, "completions/mean_length": 592.703125, "completions/mean_terminated_length": 325.18988037109375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.2346666666666666, "grad_norm": 0.22433252073730925, "kl": 0.0487060546875, "learning_rate": 1.7000981568286263e-07, "loss": 0.0329, "num_tokens": 32598524.0, "reward": 0.3125, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 486.1328125, "completions/mean_terminated_length": 328.57574462890625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.2382222222222223, "grad_norm": 0.46877252839006406, "kl": 0.0484619140625, "learning_rate": 1.6864581199841226e-07, "loss": 0.0611, "num_tokens": 32680137.0, "reward": 0.39453125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.7734375, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 506.3125, "completions/mean_terminated_length": 340.865966796875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.2417777777777779, "grad_norm": 0.38402261693088885, "kl": 0.054595947265625, "learning_rate": 1.6728451155923966e-07, "loss": 0.0809, "num_tokens": 32764265.0, "reward": 0.3828125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 556.6484375, "completions/mean_terminated_length": 387.60638427734375, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.2453333333333334, "grad_norm": 0.41263398349489333, "kl": 0.0552978515625, "learning_rate": 1.6592595959877493e-07, "loss": 0.0691, "num_tokens": 32854920.0, "reward": 0.375, "reward_std": 0.06865385919809341, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 900.0, "completions/mean_length": 660.0859375, "completions/mean_terminated_length": 402.9200134277344, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.248888888888889, "grad_norm": 0.5009619285988075, "kl": 0.04510498046875, "learning_rate": 1.6457020125912158e-07, "loss": 0.085, "num_tokens": 32958851.0, "reward": 0.2734375, "reward_std": 0.09329995512962341, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 911.0, "completions/mean_length": 643.421875, "completions/mean_terminated_length": 374.4800109863281, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.2524444444444445, "grad_norm": 0.49763404654696913, "kl": 0.04632568359375, "learning_rate": 1.6321728158955633e-07, "loss": 0.0851, "num_tokens": 33060593.0, "reward": 0.296875, "reward_std": 0.08768948912620544, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 917.0, "completions/mean_length": 440.1328125, "completions/mean_terminated_length": 318.95281982421875, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.256, "grad_norm": 0.43280364990400033, "kl": 0.05126953125, "learning_rate": 1.6186724554503237e-07, "loss": 0.0315, "num_tokens": 33136190.0, "reward": 0.41015625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.8046875, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 972.0, "completions/mean_length": 537.0390625, "completions/mean_terminated_length": 353.7742004394531, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.2595555555555555, "grad_norm": 0.44861768758478837, "kl": 0.046539306640625, "learning_rate": 1.6052013798468528e-07, "loss": 0.0866, "num_tokens": 33224307.0, "reward": 0.34765625, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 528.6484375, "completions/mean_terminated_length": 342.2257995605469, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 1.263111111111111, "grad_norm": 0.49586029167018847, "kl": 0.045745849609375, "learning_rate": 1.5917600367034302e-07, "loss": 0.0573, "num_tokens": 33311354.0, "reward": 0.35546875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 569.7421875, "completions/mean_terminated_length": 347.8953552246094, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.2666666666666666, "grad_norm": 0.48498739472089847, "kl": 0.046600341796875, "learning_rate": 1.578348872650378e-07, "loss": 0.065, "num_tokens": 33403673.0, "reward": 0.33203125, "reward_std": 0.0859375, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 492.078125, "completions/mean_terminated_length": 329.2449035644531, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 1.2702222222222221, "grad_norm": 0.36121105153336, "kl": 0.05108642578125, "learning_rate": 1.564968333315229e-07, "loss": 0.0479, "num_tokens": 33485987.0, "reward": 0.3828125, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1014.0, "completions/mean_length": 579.0703125, "completions/mean_terminated_length": 361.7790832519531, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.2737777777777777, "grad_norm": 0.3901767984094118, "kl": 0.045166015625, "learning_rate": 1.5516188633079107e-07, "loss": 0.0936, "num_tokens": 33579540.0, "reward": 0.328125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 612.6953125, "completions/mean_terminated_length": 381.9634094238281, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 1.2773333333333334, "grad_norm": 0.49565931288789694, "kl": 0.0479736328125, "learning_rate": 1.5383009062059794e-07, "loss": 0.042, "num_tokens": 33677361.0, "reward": 0.30859375, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 596.453125, "completions/mean_terminated_length": 372.5, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 1.280888888888889, "grad_norm": 0.4352423934058431, "kl": 0.046142578125, "learning_rate": 1.525014904539873e-07, "loss": 0.0592, "num_tokens": 33773063.0, "reward": 0.328125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 623.0625, "completions/mean_terminated_length": 320.9862976074219, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 1.2844444444444445, "grad_norm": 0.41589511739999835, "kl": 0.0469970703125, "learning_rate": 1.5117612997782158e-07, "loss": 0.0729, "num_tokens": 33872179.0, "reward": 0.26171875, "reward_std": 0.05952189117670059, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 633.0703125, "completions/mean_terminated_length": 390.5949401855469, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.288, "grad_norm": 0.3818948579734981, "kl": 0.04827880859375, "learning_rate": 1.49854053231314e-07, "loss": 0.0303, "num_tokens": 33972628.0, "reward": 0.296875, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 547.2421875, "completions/mean_terminated_length": 374.7978515625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.2915555555555556, "grad_norm": 0.4934807097913043, "kl": 0.05255126953125, "learning_rate": 1.4853530414456612e-07, "loss": 0.1276, "num_tokens": 34061983.0, "reward": 0.359375, "reward_std": 0.09198048710823059, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 594.859375, "completions/mean_terminated_length": 354.1219482421875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.295111111111111, "grad_norm": 0.35269264766875597, "kl": 0.0489501953125, "learning_rate": 1.4721992653710718e-07, "loss": 0.0646, "num_tokens": 34157553.0, "reward": 0.3125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 557.46875, "completions/mean_terminated_length": 360.4888916015625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.2986666666666666, "grad_norm": 0.38415028608884955, "kl": 0.04730224609375, "learning_rate": 1.45907964116439e-07, "loss": 0.0773, "num_tokens": 34248297.0, "reward": 0.34765625, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 968.0, "completions/mean_length": 608.484375, "completions/mean_terminated_length": 390.8333435058594, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.3022222222222222, "grad_norm": 0.4704827945908406, "kl": 0.0517578125, "learning_rate": 1.4459946047658305e-07, "loss": 0.0718, "num_tokens": 34345551.0, "reward": 0.30859375, "reward_std": 0.08175078779459, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 574.2109375, "completions/mean_terminated_length": 330.3493957519531, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 1.3057777777777777, "grad_norm": 0.3021618038785495, "kl": 0.048248291015625, "learning_rate": 1.4329445909663194e-07, "loss": 0.0237, "num_tokens": 34438414.0, "reward": 0.32421875, "reward_std": 0.0234375, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 476.9296875, "completions/mean_terminated_length": 330.68316650390625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 1.3093333333333335, "grad_norm": 0.30844179903870933, "kl": 0.048797607421875, "learning_rate": 1.4199300333930515e-07, "loss": 0.0278, "num_tokens": 34518853.0, "reward": 0.40234375, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.7890625, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 651.1328125, "completions/mean_terminated_length": 396.0131530761719, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.3128888888888888, "grad_norm": 0.4348121985912503, "kl": 0.046844482421875, "learning_rate": 1.4069513644950744e-07, "loss": 0.1033, "num_tokens": 34621626.0, "reward": 0.296875, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 585.5859375, "completions/mean_terminated_length": 363.8000183105469, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.3164444444444445, "grad_norm": 0.5691267571447906, "kl": 0.0498046875, "learning_rate": 1.394009015528927e-07, "loss": 0.1066, "num_tokens": 34715993.0, "reward": 0.32421875, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 556.265625, "completions/mean_terminated_length": 343.6590881347656, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.32, "grad_norm": 0.4232368305179094, "kl": 0.048553466796875, "learning_rate": 1.3811034165443036e-07, "loss": 0.0869, "num_tokens": 34806587.0, "reward": 0.34765625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 529.6484375, "completions/mean_terminated_length": 364.8645935058594, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.3235555555555556, "grad_norm": 0.553235689719908, "kl": 0.053802490234375, "learning_rate": 1.3682349963697676e-07, "loss": 0.0624, "num_tokens": 34893742.0, "reward": 0.3671875, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 627.3046875, "completions/mean_terminated_length": 389.2875061035156, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.3271111111111111, "grad_norm": 0.2964706380584295, "kl": 0.049407958984375, "learning_rate": 1.3554041825985e-07, "loss": 0.0381, "num_tokens": 34993413.0, "reward": 0.3125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 837.0, "completions/mean_length": 613.5234375, "completions/mean_terminated_length": 341.64935302734375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.3306666666666667, "grad_norm": 0.2563181478490565, "kl": 0.049285888671875, "learning_rate": 1.3426114015740915e-07, "loss": 0.0394, "num_tokens": 35091372.0, "reward": 0.29296875, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 631.625, "completions/mean_terminated_length": 388.253173828125, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 1.3342222222222222, "grad_norm": 0.5082084701623665, "kl": 0.05084228515625, "learning_rate": 1.3298570783763805e-07, "loss": 0.0473, "num_tokens": 35191708.0, "reward": 0.3046875, "reward_std": 0.07525776326656342, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 551.296875, "completions/mean_terminated_length": 312.1647033691406, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.3377777777777777, "grad_norm": 0.4114547526286328, "kl": 0.057098388671875, "learning_rate": 1.31714163680732e-07, "loss": 0.0621, "num_tokens": 35281638.0, "reward": 0.328125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 498.9375, "completions/mean_terminated_length": 365.0980529785156, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.3413333333333333, "grad_norm": 0.24726401581296917, "kl": 0.054595947265625, "learning_rate": 1.3044654993769044e-07, "loss": 0.0375, "num_tokens": 35364950.0, "reward": 0.39453125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.7890625, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 960.0, "completions/mean_length": 522.515625, "completions/mean_terminated_length": 326.2826232910156, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.3448888888888888, "grad_norm": 0.3753761855079249, "kl": 0.053375244140625, "learning_rate": 1.2918290872891236e-07, "loss": 0.0735, "num_tokens": 35451216.0, "reward": 0.3515625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 903.0, "completions/mean_length": 592.2890625, "completions/mean_terminated_length": 315.5513000488281, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.3484444444444446, "grad_norm": 0.38683356576812983, "kl": 0.051910400390625, "learning_rate": 1.2792328204279712e-07, "loss": 0.0559, "num_tokens": 35546421.0, "reward": 0.29296875, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 591.03125, "completions/mean_terminated_length": 401.3033752441406, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 1.3519999999999999, "grad_norm": 0.6091935511875326, "kl": 0.05224609375, "learning_rate": 1.2666771173434892e-07, "loss": 0.0704, "num_tokens": 35641501.0, "reward": 0.33984375, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 566.6796875, "completions/mean_terminated_length": 358.80682373046875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 1.3555555555555556, "grad_norm": 0.35005187805482085, "kl": 0.05487060546875, "learning_rate": 1.2541623952378655e-07, "loss": 0.0387, "num_tokens": 35733484.0, "reward": 0.32421875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 669.1640625, "completions/mean_terminated_length": 375.1571350097656, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 1.3591111111111112, "grad_norm": 0.34905731076141494, "kl": 0.05194091796875, "learning_rate": 1.2416890699515636e-07, "loss": 0.0654, "num_tokens": 35838625.0, "reward": 0.26171875, "reward_std": 0.09077189117670059, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 916.0, "completions/mean_length": 613.953125, "completions/mean_terminated_length": 359.6202697753906, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.3626666666666667, "grad_norm": 0.4514558086162918, "kl": 0.05438232421875, "learning_rate": 1.2292575559495143e-07, "loss": 0.088, "num_tokens": 35936643.0, "reward": 0.2890625, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 606.140625, "completions/mean_terminated_length": 338.28204345703125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.3662222222222222, "grad_norm": 0.3303484328737797, "kl": 0.052032470703125, "learning_rate": 1.216868266307333e-07, "loss": 0.0176, "num_tokens": 36033617.0, "reward": 0.296875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 556.3046875, "completions/mean_terminated_length": 351.35955810546875, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 1.3697777777777778, "grad_norm": 0.3982447361138536, "kl": 0.05126953125, "learning_rate": 1.2045216126976054e-07, "loss": 0.049, "num_tokens": 36124268.0, "reward": 0.34375, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 636.0703125, "completions/mean_terminated_length": 324.6337890625, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 1.3733333333333333, "grad_norm": 0.42312400543526135, "kl": 0.050140380859375, "learning_rate": 1.1922180053761985e-07, "loss": 0.0428, "num_tokens": 36225085.0, "reward": 0.28125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 553.3359375, "completions/mean_terminated_length": 369.1630554199219, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 1.3768888888888888, "grad_norm": 0.4691279372381847, "kl": 0.05670166015625, "learning_rate": 1.1799578531686355e-07, "loss": 0.0656, "num_tokens": 36315312.0, "reward": 0.35546875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 577.1171875, "completions/mean_terminated_length": 373.9886474609375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.3804444444444444, "grad_norm": 0.38451829010368166, "kl": 0.051300048828125, "learning_rate": 1.1677415634565066e-07, "loss": 0.0442, "num_tokens": 36408599.0, "reward": 0.33984375, "reward_std": 0.06325855106115341, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 550.1953125, "completions/mean_terminated_length": 357.5494689941406, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 1.384, "grad_norm": 0.3348533935130239, "kl": 0.054718017578125, "learning_rate": 1.1555695421639369e-07, "loss": 0.0521, "num_tokens": 36498468.0, "reward": 0.34375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 528.90625, "completions/mean_terminated_length": 363.875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.3875555555555557, "grad_norm": 0.34734645763254457, "kl": 0.054656982421875, "learning_rate": 1.1434421937440927e-07, "loss": 0.0511, "num_tokens": 36585516.0, "reward": 0.36328125, "reward_std": 0.05050079524517059, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 544.2890625, "completions/mean_terminated_length": 334.0786437988281, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.3911111111111112, "grad_norm": 0.5226023770066429, "kl": 0.0576171875, "learning_rate": 1.1313599211657493e-07, "loss": 0.0686, "num_tokens": 36674553.0, "reward": 0.34375, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 591.9609375, "completions/mean_terminated_length": 332.7375183105469, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.3946666666666667, "grad_norm": 0.4646464238974429, "kl": 0.052276611328125, "learning_rate": 1.1193231258998933e-07, "loss": 0.0722, "num_tokens": 36769700.0, "reward": 0.3125, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 871.0, "completions/mean_length": 600.078125, "completions/mean_terminated_length": 362.2682800292969, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.3982222222222223, "grad_norm": 0.39069433762532657, "kl": 0.0538330078125, "learning_rate": 1.1073322079063913e-07, "loss": 0.0622, "num_tokens": 36865918.0, "reward": 0.3125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 497.59375, "completions/mean_terminated_length": 336.448974609375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.4017777777777778, "grad_norm": 0.46411797290095597, "kl": 0.061126708984375, "learning_rate": 1.0953875656206896e-07, "loss": 0.0472, "num_tokens": 36948982.0, "reward": 0.3828125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 702.734375, "completions/mean_terminated_length": 460.6849365234375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 1.4053333333333333, "grad_norm": 0.361002855228315, "kl": 0.04974365234375, "learning_rate": 1.083489595940586e-07, "loss": 0.0593, "num_tokens": 37058396.0, "reward": 0.27734375, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 532.3125, "completions/mean_terminated_length": 347.2688293457031, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.4088888888888889, "grad_norm": 0.5451534782886297, "kl": 0.05438232421875, "learning_rate": 1.0716386942130312e-07, "loss": 0.1024, "num_tokens": 37145884.0, "reward": 0.3671875, "reward_std": 0.06865385919809341, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 647.171875, "completions/mean_terminated_length": 389.34210205078125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 1.4124444444444444, "grad_norm": 0.4383050780619703, "kl": 0.052093505859375, "learning_rate": 1.0598352542210021e-07, "loss": 0.0772, "num_tokens": 37248178.0, "reward": 0.2890625, "reward_std": 0.07525776326656342, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 571.265625, "completions/mean_terminated_length": 380.1111145019531, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.416, "grad_norm": 0.49744576073848973, "kl": 0.053924560546875, "learning_rate": 1.0480796681704077e-07, "loss": 0.1079, "num_tokens": 37340676.0, "reward": 0.3515625, "reward_std": 0.11607225239276886, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 583.0703125, "completions/mean_terminated_length": 352.1071472167969, "completions/min_length": 103.0, "completions/min_terminated_length": 103.0, "epoch": 1.4195555555555557, "grad_norm": 0.5201110119058623, "kl": 0.0570068359375, "learning_rate": 1.0363723266770649e-07, "loss": 0.0751, "num_tokens": 37434693.0, "reward": 0.33984375, "reward_std": 0.08548745512962341, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 582.484375, "completions/mean_terminated_length": 351.21429443359375, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 1.423111111111111, "grad_norm": 0.478950951032366, "kl": 0.058624267578125, "learning_rate": 1.0247136187537123e-07, "loss": 0.0584, "num_tokens": 37528643.0, "reward": 0.328125, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 609.078125, "completions/mean_terminated_length": 343.1025695800781, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.4266666666666667, "grad_norm": 0.5122183824085633, "kl": 0.057861328125, "learning_rate": 1.0131039317970907e-07, "loss": 0.1266, "num_tokens": 37626053.0, "reward": 0.3046875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 627.0546875, "completions/mean_terminated_length": 396.7283935546875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.4302222222222223, "grad_norm": 0.4102118823905047, "kl": 0.05316162109375, "learning_rate": 1.0015436515750636e-07, "loss": 0.0588, "num_tokens": 37725812.0, "reward": 0.3125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 601.1875, "completions/mean_terminated_length": 302.3999938964844, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 1.4337777777777778, "grad_norm": 0.211724766464037, "kl": 0.05120849609375, "learning_rate": 9.900331622138063e-08, "loss": 0.0192, "num_tokens": 37822184.0, "reward": 0.28515625, "reward_std": 0.016833597794175148, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 569.546875, "completions/mean_terminated_length": 355.3793029785156, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.4373333333333334, "grad_norm": 0.44972353199716397, "kl": 0.058135986328125, "learning_rate": 9.785728461850346e-08, "loss": 0.0148, "num_tokens": 37914482.0, "reward": 0.33203125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 614.46875, "completions/mean_terminated_length": 399.952392578125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 1.4408888888888889, "grad_norm": 0.4253099608195351, "kl": 0.05194091796875, "learning_rate": 9.671630842933027e-08, "loss": 0.0514, "num_tokens": 38012598.0, "reward": 0.3046875, "reward_std": 0.05831329524517059, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1008.0, "completions/mean_length": 636.125, "completions/mean_terminated_length": 379.22076416015625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.4444444444444444, "grad_norm": 0.33922794539403645, "kl": 0.05377197265625, "learning_rate": 9.558042556633439e-08, "loss": 0.0568, "num_tokens": 38113430.0, "reward": 0.29296875, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 559.40625, "completions/mean_terminated_length": 355.8202209472656, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.448, "grad_norm": 0.6508014224522416, "kl": 0.060089111328125, "learning_rate": 9.44496737727479e-08, "loss": 0.1441, "num_tokens": 38204394.0, "reward": 0.32421875, "reward_std": 0.10221018642187119, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 640.046875, "completions/mean_terminated_length": 368.7200012207031, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.4515555555555555, "grad_norm": 0.29783152624543474, "kl": 0.05450439453125, "learning_rate": 9.332409062130686e-08, "loss": 0.0423, "num_tokens": 38305796.0, "reward": 0.2890625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 475.21875, "completions/mean_terminated_length": 328.51483154296875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.455111111111111, "grad_norm": 0.6409980051318146, "kl": 0.055267333984375, "learning_rate": 9.220371351300352e-08, "loss": 0.1234, "num_tokens": 38386024.0, "reward": 0.390625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.7734375, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 583.7578125, "completions/mean_terminated_length": 353.1547546386719, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.4586666666666668, "grad_norm": 0.48264063824874753, "kl": 0.0601806640625, "learning_rate": 9.10885796758428e-08, "loss": 0.0697, "num_tokens": 38480149.0, "reward": 0.3203125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1640625, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 440.4375, "completions/mean_terminated_length": 325.9065246582031, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 1.462222222222222, "grad_norm": 0.5072300694040541, "kl": 0.05584716796875, "learning_rate": 8.997872616360603e-08, "loss": 0.072, "num_tokens": 38555821.0, "reward": 0.4140625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.828125, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 580.5, "completions/mean_terminated_length": 393.24444580078125, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.4657777777777778, "grad_norm": 0.5249295654451599, "kl": 0.05535888671875, "learning_rate": 8.887418985461903e-08, "loss": 0.1168, "num_tokens": 38649577.0, "reward": 0.33984375, "reward_std": 0.0973757952451706, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 952.0, "completions/mean_length": 523.2734375, "completions/mean_terminated_length": 303.85394287109375, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 1.4693333333333334, "grad_norm": 0.39162703968567836, "kl": 0.0574951171875, "learning_rate": 8.777500745052743e-08, "loss": 0.0642, "num_tokens": 38735872.0, "reward": 0.34765625, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 959.0, "completions/mean_length": 628.15625, "completions/mean_terminated_length": 398.4691467285156, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 1.472888888888889, "grad_norm": 0.48349752548114505, "kl": 0.058868408203125, "learning_rate": 8.668121547507634e-08, "loss": 0.0871, "num_tokens": 38835676.0, "reward": 0.29296875, "reward_std": 0.10639689117670059, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 579.78125, "completions/mean_terminated_length": 362.83721923828125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.4764444444444444, "grad_norm": 0.45657304605313564, "kl": 0.055023193359375, "learning_rate": 8.559285027289753e-08, "loss": 0.0409, "num_tokens": 38929288.0, "reward": 0.3125, "reward_std": 0.058313291519880295, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 995.0, "completions/mean_length": 651.0234375, "completions/mean_terminated_length": 351.591552734375, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.48, "grad_norm": 0.5676783277502544, "kl": 0.058746337890625, "learning_rate": 8.450994800830111e-08, "loss": 0.0807, "num_tokens": 39031971.0, "reward": 0.28125, "reward_std": 0.09329995512962341, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 783.0, "completions/mean_length": 552.5546875, "completions/mean_terminated_length": 296.9517822265625, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.4835555555555555, "grad_norm": 0.5423121681184915, "kl": 0.073150634765625, "learning_rate": 8.343254466407435e-08, "loss": 0.0402, "num_tokens": 39122122.0, "reward": 0.328125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 582.578125, "completions/mean_terminated_length": 359.2705993652344, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.487111111111111, "grad_norm": 0.3308406056720156, "kl": 0.05609130859375, "learning_rate": 8.236067604028562e-08, "loss": 0.0565, "num_tokens": 39216100.0, "reward": 0.31640625, "reward_std": 0.041479695588350296, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 544.4140625, "completions/mean_terminated_length": 377.8210754394531, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.4906666666666666, "grad_norm": 0.4108868028800648, "kl": 0.060760498046875, "learning_rate": 8.129437775309533e-08, "loss": 0.0931, "num_tokens": 39305189.0, "reward": 0.37109375, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 601.3125, "completions/mean_terminated_length": 356.04937744140625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.4942222222222221, "grad_norm": 0.4247109140676018, "kl": 0.0565185546875, "learning_rate": 8.023368523357182e-08, "loss": 0.0661, "num_tokens": 39401625.0, "reward": 0.3125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 969.0, "completions/mean_length": 625.8046875, "completions/mean_terminated_length": 409.9156494140625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.4977777777777779, "grad_norm": 0.3955389838477825, "kl": 0.05816650390625, "learning_rate": 7.917863372651476e-08, "loss": 0.0648, "num_tokens": 39501144.0, "reward": 0.3203125, "reward_std": 0.07107105106115341, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 581.2421875, "completions/mean_terminated_length": 357.2588195800781, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 1.5013333333333332, "grad_norm": 0.5552809767450916, "kl": 0.056549072265625, "learning_rate": 7.812925828928332e-08, "loss": 0.0908, "num_tokens": 39594931.0, "reward": 0.328125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 560.5390625, "completions/mean_terminated_length": 399.5473937988281, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 1.504888888888889, "grad_norm": 0.47738135651667435, "kl": 0.058197021484375, "learning_rate": 7.708559379063204e-08, "loss": 0.0961, "num_tokens": 39686076.0, "reward": 0.3671875, "reward_std": 0.08537658303976059, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 548.9765625, "completions/mean_terminated_length": 348.4111328125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.5084444444444445, "grad_norm": 0.5791223341849459, "kl": 0.060302734375, "learning_rate": 7.604767490955138e-08, "loss": 0.0979, "num_tokens": 39775785.0, "reward": 0.35546875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 556.3671875, "completions/mean_terminated_length": 358.9222412109375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.512, "grad_norm": 0.48901925413841724, "kl": 0.07611083984375, "learning_rate": 7.501553613411626e-08, "loss": 0.0847, "num_tokens": 39866432.0, "reward": 0.34765625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 566.59375, "completions/mean_terminated_length": 327.0, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.5155555555555555, "grad_norm": 0.5304614791186009, "kl": 0.063720703125, "learning_rate": 7.398921176033928e-08, "loss": 0.1492, "num_tokens": 39958412.0, "reward": 0.32421875, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 629.015625, "completions/mean_terminated_length": 358.7631530761719, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.519111111111111, "grad_norm": 0.31610727376671266, "kl": 0.053924560546875, "learning_rate": 7.296873589103184e-08, "loss": 0.0602, "num_tokens": 40058334.0, "reward": 0.30078125, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1016.0, "completions/mean_length": 555.140625, "completions/mean_terminated_length": 371.6739196777344, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.5226666666666666, "grad_norm": 0.46266071912391504, "kl": 0.05865478515625, "learning_rate": 7.195414243467029e-08, "loss": 0.1176, "num_tokens": 40148772.0, "reward": 0.3515625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 919.0, "completions/mean_length": 568.265625, "completions/mean_terminated_length": 353.4942626953125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.5262222222222221, "grad_norm": 0.4926827661651201, "kl": 0.0615234375, "learning_rate": 7.094546510426994e-08, "loss": 0.0768, "num_tokens": 40240878.0, "reward": 0.3359375, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 514.578125, "completions/mean_terminated_length": 315.2391357421875, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 1.529777777777778, "grad_norm": 0.3307947277316019, "kl": 0.0595703125, "learning_rate": 6.994273741626405e-08, "loss": 0.0513, "num_tokens": 40326048.0, "reward": 0.3515625, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 567.0859375, "completions/mean_terminated_length": 335.9411926269531, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.5333333333333332, "grad_norm": 0.29575891901225226, "kl": 0.059051513671875, "learning_rate": 6.8945992689391e-08, "loss": 0.0167, "num_tokens": 40418031.0, "reward": 0.3203125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 582.1015625, "completions/mean_terminated_length": 381.2386474609375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 1.536888888888889, "grad_norm": 0.4773537181669013, "kl": 0.058441162109375, "learning_rate": 6.795526404358628e-08, "loss": 0.0658, "num_tokens": 40511904.0, "reward": 0.33203125, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 909.0, "completions/mean_length": 585.984375, "completions/mean_terminated_length": 323.1750183105469, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 1.5404444444444443, "grad_norm": 0.5218897184627993, "kl": 0.05926513671875, "learning_rate": 6.697058439888283e-08, "loss": 0.1136, "num_tokens": 40606286.0, "reward": 0.3125, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 567.5390625, "completions/mean_terminated_length": 352.42529296875, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.544, "grad_norm": 0.4715931511639487, "kl": 0.056793212890625, "learning_rate": 6.599198647431642e-08, "loss": 0.1108, "num_tokens": 40698375.0, "reward": 0.33203125, "reward_std": 0.0817507952451706, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1007.0, "completions/mean_length": 602.84375, "completions/mean_terminated_length": 332.8717956542969, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.5475555555555556, "grad_norm": 0.5199092549193227, "kl": 0.057220458984375, "learning_rate": 6.501950278683907e-08, "loss": 0.0746, "num_tokens": 40794923.0, "reward": 0.2890625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 628.96875, "completions/mean_terminated_length": 349.8133544921875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.551111111111111, "grad_norm": 0.40696172437097355, "kl": 0.053558349609375, "learning_rate": 6.405316565023805e-08, "loss": 0.0696, "num_tokens": 40894855.0, "reward": 0.28125, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.5546875, "rewards/equation_reward_func/std": 0.4989531338214874, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 552.3984375, "completions/mean_terminated_length": 338.0340881347656, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.5546666666666666, "grad_norm": 0.30456198181801936, "kl": 0.061126708984375, "learning_rate": 6.309300717406274e-08, "loss": 0.0233, "num_tokens": 40984890.0, "reward": 0.3515625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 492.953125, "completions/mean_terminated_length": 323.2370910644531, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.5582222222222222, "grad_norm": 0.541360750567622, "kl": 0.059906005859375, "learning_rate": 6.213905926255697e-08, "loss": 0.0522, "num_tokens": 41067320.0, "reward": 0.37109375, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 529.265625, "completions/mean_terminated_length": 343.07525634765625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.561777777777778, "grad_norm": 0.3988666309694946, "kl": 0.063018798828125, "learning_rate": 6.119135361359965e-08, "loss": 0.0414, "num_tokens": 41154446.0, "reward": 0.359375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 587.7109375, "completions/mean_terminated_length": 367.0, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.5653333333333332, "grad_norm": 0.5073575168624447, "kl": 0.057586669921875, "learning_rate": 6.024992171765089e-08, "loss": 0.083, "num_tokens": 41249081.0, "reward": 0.328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 580.3125, "completions/mean_terminated_length": 371.2183837890625, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.568888888888889, "grad_norm": 0.3903467673950676, "kl": 0.062744140625, "learning_rate": 5.9314794856705983e-08, "loss": 0.014, "num_tokens": 41342797.0, "reward": 0.3359375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 540.984375, "completions/mean_terminated_length": 344.5934143066406, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.5724444444444443, "grad_norm": 0.4461348226859665, "kl": 0.05853271484375, "learning_rate": 5.8386004103255975e-08, "loss": 0.036, "num_tokens": 41431419.0, "reward": 0.34375, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 616.0703125, "completions/mean_terminated_length": 371.3125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.576, "grad_norm": 0.4667331263109846, "kl": 0.062286376953125, "learning_rate": 5.7463580319254853e-08, "loss": 0.1041, "num_tokens": 41529720.0, "reward": 0.30859375, "reward_std": 0.11013355851173401, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 980.0, "completions/mean_length": 552.3125, "completions/mean_terminated_length": 360.5274963378906, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 1.5795555555555556, "grad_norm": 0.49927183159937233, "kl": 0.05731201171875, "learning_rate": 5.6547554155094626e-08, "loss": 0.1198, "num_tokens": 41619900.0, "reward": 0.3515625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 637.265625, "completions/mean_terminated_length": 355.0540771484375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 1.5831111111111111, "grad_norm": 0.42609209251294944, "kl": 0.0595703125, "learning_rate": 5.563795604858615e-08, "loss": 0.0793, "num_tokens": 41720862.0, "reward": 0.28515625, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 978.0, "completions/mean_length": 603.7890625, "completions/mean_terminated_length": 343.15191650390625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 1.5866666666666667, "grad_norm": 0.33644419692067673, "kl": 0.057373046875, "learning_rate": 5.473481622394849e-08, "loss": 0.0618, "num_tokens": 41817579.0, "reward": 0.30078125, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 599.296875, "completions/mean_terminated_length": 376.8333435058594, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 1.5902222222222222, "grad_norm": 0.2763792668243421, "kl": 0.062042236328125, "learning_rate": 5.3838164690803935e-08, "loss": 0.0347, "num_tokens": 41913649.0, "reward": 0.3203125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1796875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 461.6484375, "completions/mean_terminated_length": 338.4666748046875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.5937777777777777, "grad_norm": 0.39232035615820005, "kl": 0.063262939453125, "learning_rate": 5.294803124318145e-08, "loss": 0.0405, "num_tokens": 41992144.0, "reward": 0.41015625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.8125, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 545.0390625, "completions/mean_terminated_length": 385.38543701171875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.5973333333333333, "grad_norm": 0.34072310623207164, "kl": 0.061004638671875, "learning_rate": 5.20644454585262e-08, "loss": 0.0239, "num_tokens": 42081277.0, "reward": 0.3671875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 975.0, "completions/mean_length": 513.8515625, "completions/mean_terminated_length": 350.8144226074219, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 1.600888888888889, "grad_norm": 0.5025369359731723, "kl": 0.05792236328125, "learning_rate": 5.1187436696716906e-08, "loss": 0.0882, "num_tokens": 42166466.0, "reward": 0.36328125, "reward_std": 0.07514689117670059, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 575.9453125, "completions/mean_terminated_length": 357.1278991699219, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.6044444444444443, "grad_norm": 0.2627319272861186, "kl": 0.056121826171875, "learning_rate": 5.0317034099090524e-08, "loss": 0.0387, "num_tokens": 42259579.0, "reward": 0.3359375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 572.3671875, "completions/mean_terminated_length": 367.0795593261719, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.608, "grad_norm": 0.47988527495174843, "kl": 0.058807373046875, "learning_rate": 4.9453266587473423e-08, "loss": 0.101, "num_tokens": 42352254.0, "reward": 0.33984375, "reward_std": 0.09077189117670059, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 634.0859375, "completions/mean_terminated_length": 400.13751220703125, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 1.6115555555555554, "grad_norm": 0.5200813861863092, "kl": 0.059051513671875, "learning_rate": 4.859616286322094e-08, "loss": 0.0393, "num_tokens": 42452841.0, "reward": 0.3046875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 476.734375, "completions/mean_terminated_length": 330.4356384277344, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 1.6151111111111112, "grad_norm": 0.4239479171884541, "kl": 0.06365966796875, "learning_rate": 4.774575140626316e-08, "loss": 0.0457, "num_tokens": 42533215.0, "reward": 0.3828125, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 599.125, "completions/mean_terminated_length": 360.7804870605469, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.6186666666666667, "grad_norm": 0.3912823652117178, "kl": 0.05731201171875, "learning_rate": 4.6902060474159036e-08, "loss": 0.0779, "num_tokens": 42629283.0, "reward": 0.3203125, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 514.859375, "completions/mean_terminated_length": 345.1458435058594, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.6222222222222222, "grad_norm": 0.6284694289868051, "kl": 0.0572509765625, "learning_rate": 4.6065118101157016e-08, "loss": 0.0813, "num_tokens": 42714569.0, "reward": 0.375, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 791.0, "completions/mean_length": 465.484375, "completions/mean_terminated_length": 294.51019287109375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 1.6257777777777778, "grad_norm": 0.74834512218541, "kl": 0.0626220703125, "learning_rate": 4.5234952097263965e-08, "loss": 0.1441, "num_tokens": 42793459.0, "reward": 0.3984375, "reward_std": 0.10892494767904282, "rewards/equation_reward_func/mean": 0.765625, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 1005.0, "completions/mean_length": 516.4609375, "completions/mean_terminated_length": 347.28125, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.6293333333333333, "grad_norm": 0.47224321424996407, "kl": 0.058563232421875, "learning_rate": 4.4411590047320617e-08, "loss": 0.0715, "num_tokens": 42878882.0, "reward": 0.3671875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 669.234375, "completions/mean_terminated_length": 384.4225158691406, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.6328888888888888, "grad_norm": 0.4327215359196415, "kl": 0.052734375, "learning_rate": 4.359505931008553e-08, "loss": 0.0743, "num_tokens": 42983948.0, "reward": 0.27734375, "reward_std": 0.06612578779459, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 544.6015625, "completions/mean_terminated_length": 318.67816162109375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.6364444444444444, "grad_norm": 0.2976770076211903, "kl": 0.05694580078125, "learning_rate": 4.278538701732534e-08, "loss": 0.0173, "num_tokens": 43073057.0, "reward": 0.34765625, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 522.8671875, "completions/mean_terminated_length": 341.60638427734375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.6400000000000001, "grad_norm": 0.47808895254215406, "kl": 0.058319091796875, "learning_rate": 4.198260007291399e-08, "loss": 0.0571, "num_tokens": 43159364.0, "reward": 0.36328125, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 651.5859375, "completions/mean_terminated_length": 371.0, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 1.6435555555555554, "grad_norm": 0.39165305050300414, "kl": 0.0579833984375, "learning_rate": 4.118672515193794e-08, "loss": 0.0313, "num_tokens": 43262163.0, "reward": 0.27734375, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.546875, "rewards/equation_reward_func/std": 0.4997538626194, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 524.21875, "completions/mean_terminated_length": 328.6521911621094, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 1.6471111111111112, "grad_norm": 0.38925167825407714, "kl": 0.057708740234375, "learning_rate": 4.039778869981064e-08, "loss": 0.0099, "num_tokens": 43348619.0, "reward": 0.3515625, "reward_std": 0.04400775954127312, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 555.5703125, "completions/mean_terminated_length": 379.2795715332031, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.6506666666666665, "grad_norm": 0.5856622403302902, "kl": 0.062255859375, "learning_rate": 3.961581693139307e-08, "loss": 0.1022, "num_tokens": 43439132.0, "reward": 0.359375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 654.1328125, "completions/mean_terminated_length": 401.0657958984375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.6542222222222223, "grad_norm": 0.514721117483945, "kl": 0.054901123046875, "learning_rate": 3.884083583012318e-08, "loss": 0.065, "num_tokens": 43542329.0, "reward": 0.30078125, "reward_std": 0.0973757952451706, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.1953125, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 475.34375, "completions/mean_terminated_length": 342.1747741699219, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 1.6577777777777778, "grad_norm": 0.47945649570083787, "kl": 0.05755615234375, "learning_rate": 3.807287114715216e-08, "loss": 0.0743, "num_tokens": 43622509.0, "reward": 0.39453125, "reward_std": 0.06986245512962341, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 606.890625, "completions/mean_terminated_length": 388.4047546386719, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.6613333333333333, "grad_norm": 0.430219248736202, "kl": 0.056121826171875, "learning_rate": 3.731194840048915e-08, "loss": 0.0864, "num_tokens": 43719615.0, "reward": 0.31640625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 562.1796875, "completions/mean_terminated_length": 359.8089904785156, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.6648888888888889, "grad_norm": 0.29046733493864724, "kl": 0.0584716796875, "learning_rate": 3.655809287415284e-08, "loss": 0.0015, "num_tokens": 43810990.0, "reward": 0.3359375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.5625, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 720.46875, "completions/mean_terminated_length": 330.21429443359375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.6684444444444444, "grad_norm": 0.33918520057925033, "kl": 0.05792236328125, "learning_rate": 3.581132961733191e-08, "loss": 0.0319, "num_tokens": 43922610.0, "reward": 0.2265625, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.4375, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 579.4296875, "completions/mean_terminated_length": 338.3975830078125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.6720000000000002, "grad_norm": 0.34945263291876827, "kl": 0.056488037109375, "learning_rate": 3.5071683443552045e-08, "loss": 0.0448, "num_tokens": 44016189.0, "reward": 0.328125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 495.0859375, "completions/mean_terminated_length": 340.1515197753906, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.6755555555555555, "grad_norm": 0.3168834960432101, "kl": 0.05743408203125, "learning_rate": 3.433917892985208e-08, "loss": 0.0349, "num_tokens": 44098912.0, "reward": 0.37890625, "reward_std": 0.03487579524517059, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 597.5859375, "completions/mean_terminated_length": 381.87060546875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.6791111111111112, "grad_norm": 0.5331586070453482, "kl": 0.05731201171875, "learning_rate": 3.3613840415966764e-08, "loss": 0.0554, "num_tokens": 44194827.0, "reward": 0.3359375, "reward_std": 0.10518828779459, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 836.0, "completions/mean_length": 555.5703125, "completions/mean_terminated_length": 350.3033752441406, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 1.6826666666666665, "grad_norm": 0.295219673427889, "kl": 0.06268310546875, "learning_rate": 3.2895692003518575e-08, "loss": 0.0384, "num_tokens": 44285384.0, "reward": 0.35546875, "reward_std": 0.0390625, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 610.4296875, "completions/mean_terminated_length": 370.456787109375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 1.6862222222222223, "grad_norm": 0.4671220171137538, "kl": 0.06011962890625, "learning_rate": 3.218475755521621e-08, "loss": 0.0798, "num_tokens": 44382935.0, "reward": 0.30078125, "reward_std": 0.07514689117670059, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 590.1015625, "completions/mean_terminated_length": 311.9615478515625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.6897777777777778, "grad_norm": 0.3180288080836337, "kl": 0.05328369140625, "learning_rate": 3.1481060694062365e-08, "loss": 0.0313, "num_tokens": 44477916.0, "reward": 0.3046875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 553.4375, "completions/mean_terminated_length": 403.0515441894531, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.6933333333333334, "grad_norm": 0.47389492626451124, "kl": 0.055755615234375, "learning_rate": 3.078462480256819e-08, "loss": 0.049, "num_tokens": 44568196.0, "reward": 0.37890625, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 521.5, "completions/mean_terminated_length": 360.9071960449219, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.696888888888889, "grad_norm": 0.5371588748510453, "kl": 0.057159423828125, "learning_rate": 3.0095473021976794e-08, "loss": 0.0901, "num_tokens": 44654344.0, "reward": 0.3828125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 712.0, "completions/mean_length": 559.609375, "completions/mean_terminated_length": 290.1481628417969, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 1.7004444444444444, "grad_norm": 0.4605704875155139, "kl": 0.055511474609375, "learning_rate": 2.9413628251493934e-08, "loss": 0.0858, "num_tokens": 44745286.0, "reward": 0.3203125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 619.46875, "completions/mean_terminated_length": 392.53656005859375, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.704, "grad_norm": 0.5009759043201805, "kl": 0.052764892578125, "learning_rate": 2.8739113147527417e-08, "loss": 0.0655, "num_tokens": 44844010.0, "reward": 0.30859375, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 627.8984375, "completions/mean_terminated_length": 347.9866638183594, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.7075555555555555, "grad_norm": 0.502236882705769, "kl": 0.05657958984375, "learning_rate": 2.8071950122934036e-08, "loss": 0.084, "num_tokens": 44943793.0, "reward": 0.30078125, "reward_std": 0.08548745512962341, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 505.71875, "completions/mean_terminated_length": 310.6666564941406, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.7111111111111112, "grad_norm": 0.4962459638883198, "kl": 0.053558349609375, "learning_rate": 2.7412161346275052e-08, "loss": 0.0592, "num_tokens": 45027877.0, "reward": 0.35546875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 549.8203125, "completions/mean_terminated_length": 371.3656005859375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.7146666666666666, "grad_norm": 0.3788449113418335, "kl": 0.059844970703125, "learning_rate": 2.675976874107935e-08, "loss": 0.0699, "num_tokens": 45117634.0, "reward": 0.35546875, "reward_std": 0.07745979726314545, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 574.171875, "completions/mean_terminated_length": 362.18389892578125, "completions/min_length": 30.0, "completions/min_terminated_length": 30.0, "epoch": 1.7182222222222223, "grad_norm": 0.4705395797183526, "kl": 0.054595947265625, "learning_rate": 2.611479398511518e-08, "loss": 0.0439, "num_tokens": 45210544.0, "reward": 0.33203125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 577.015625, "completions/mean_terminated_length": 350.8941345214844, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.7217777777777776, "grad_norm": 0.5108796025381747, "kl": 0.0528564453125, "learning_rate": 2.5477258509669614e-08, "loss": 0.0594, "num_tokens": 45303746.0, "reward": 0.32421875, "reward_std": 0.07933359593153, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 638.984375, "completions/mean_terminated_length": 400.1772155761719, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 1.7253333333333334, "grad_norm": 0.4714328834857556, "kl": 0.051239013671875, "learning_rate": 2.4847183498836714e-08, "loss": 0.0977, "num_tokens": 45404960.0, "reward": 0.2890625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.578125, "rewards/equation_reward_func/std": 0.4957992732524872, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 643.453125, "completions/mean_terminated_length": 407.417724609375, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.728888888888889, "grad_norm": 0.4109107662837105, "kl": 0.05389404296875, "learning_rate": 2.4224589888813263e-08, "loss": 0.0627, "num_tokens": 45506790.0, "reward": 0.3125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 966.0, "completions/mean_length": 613.5390625, "completions/mean_terminated_length": 367.26251220703125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 1.7324444444444445, "grad_norm": 0.41529721171353823, "kl": 0.051055908203125, "learning_rate": 2.3609498367203467e-08, "loss": 0.0477, "num_tokens": 45604743.0, "reward": 0.30859375, "reward_std": 0.0703125, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 581.59375, "completions/mean_terminated_length": 333.41461181640625, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 1.736, "grad_norm": 0.5231983110010945, "kl": 0.0560302734375, "learning_rate": 2.300192937233128e-08, "loss": 0.111, "num_tokens": 45698575.0, "reward": 0.31640625, "reward_std": 0.09737578779459, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 507.3671875, "completions/mean_terminated_length": 297.3077087402344, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.7395555555555555, "grad_norm": 0.4507770804313105, "kl": 0.059295654296875, "learning_rate": 2.240190309256143e-08, "loss": 0.0727, "num_tokens": 45782910.0, "reward": 0.3515625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1001.0, "completions/mean_length": 498.3671875, "completions/mean_terminated_length": 412.3545227050781, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.743111111111111, "grad_norm": 0.7028393369773552, "kl": 0.059844970703125, "learning_rate": 2.1809439465628382e-08, "loss": 0.1122, "num_tokens": 45866137.0, "reward": 0.4296875, "reward_std": 0.135988250374794, "rewards/equation_reward_func/mean": 0.828125, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 547.75, "completions/mean_terminated_length": 361.39129638671875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 1.7466666666666666, "grad_norm": 0.4445079281118488, "kl": 0.053070068359375, "learning_rate": 2.122455817797428e-08, "loss": 0.0829, "num_tokens": 45955613.0, "reward": 0.36328125, "reward_std": 0.060841355472803116, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.171875, "completions/max_length": 1024.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 459.7734375, "completions/mean_terminated_length": 342.6698303222656, "completions/min_length": 25.0, "completions/min_terminated_length": 25.0, "epoch": 1.7502222222222223, "grad_norm": 0.6406923595255156, "kl": 0.054931640625, "learning_rate": 2.0647278664094188e-08, "loss": 0.0575, "num_tokens": 46033796.0, "reward": 0.40625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.8046875, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1003.0, "completions/mean_length": 607.0390625, "completions/mean_terminated_length": 356.8625183105469, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.7537777777777777, "grad_norm": 0.4525261109127069, "kl": 0.053253173828125, "learning_rate": 2.007762010589098e-08, "loss": 0.0817, "num_tokens": 46130881.0, "reward": 0.3046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 927.0, "completions/mean_length": 614.1953125, "completions/mean_terminated_length": 342.7662353515625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.7573333333333334, "grad_norm": 0.5510868637122652, "kl": 0.0562744140625, "learning_rate": 1.9515601432037317e-08, "loss": 0.0712, "num_tokens": 46228874.0, "reward": 0.30078125, "reward_std": 0.0859375, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 570.0390625, "completions/mean_terminated_length": 378.3666687011719, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.7608888888888887, "grad_norm": 0.3574531605850586, "kl": 0.05596923828125, "learning_rate": 1.8961241317347333e-08, "loss": 0.045, "num_tokens": 46321259.0, "reward": 0.35546875, "reward_std": 0.05710469186306, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 515.7265625, "completions/mean_terminated_length": 339.1684265136719, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 1.7644444444444445, "grad_norm": 0.5871430307510175, "kl": 0.05511474609375, "learning_rate": 1.8414558182155456e-08, "loss": 0.0348, "num_tokens": 46406632.0, "reward": 0.390625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.046875, "rewards/format_reward_func/std": 0.21220162510871887, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 997.0, "completions/mean_length": 514.890625, "completions/mean_terminated_length": 359.0408020019531, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 1.768, "grad_norm": 0.44907653756293736, "kl": 0.0579833984375, "learning_rate": 1.787557019170488e-08, "loss": 0.0124, "num_tokens": 46491918.0, "reward": 0.3671875, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 564.6953125, "completions/mean_terminated_length": 307.03656005859375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 1.7715555555555556, "grad_norm": 0.32078795471236476, "kl": 0.05303955078125, "learning_rate": 1.734429525554365e-08, "loss": 0.0527, "num_tokens": 46583583.0, "reward": 0.31640625, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 528.015625, "completions/mean_terminated_length": 369.5051574707031, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.775111111111111, "grad_norm": 0.5800601104355154, "kl": 0.05535888671875, "learning_rate": 1.6820751026929674e-08, "loss": 0.1005, "num_tokens": 46670537.0, "reward": 0.3671875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 654.109375, "completions/mean_terminated_length": 327.73529052734375, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.7786666666666666, "grad_norm": 0.43454438167083137, "kl": 0.05303955078125, "learning_rate": 1.6304954902244095e-08, "loss": 0.0601, "num_tokens": 46773703.0, "reward": 0.265625, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 946.0, "completions/mean_length": 630.4296875, "completions/mean_terminated_length": 361.1447448730469, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 1.7822222222222224, "grad_norm": 0.41168005001264985, "kl": 0.051910400390625, "learning_rate": 1.5796924020413327e-08, "loss": 0.0569, "num_tokens": 46873814.0, "reward": 0.28125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 938.0, "completions/mean_length": 630.9921875, "completions/mean_terminated_length": 387.22784423828125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.7857777777777777, "grad_norm": 0.38234592197979783, "kl": 0.053741455078125, "learning_rate": 1.529667526233941e-08, "loss": 0.0646, "num_tokens": 46973953.0, "reward": 0.30859375, "reward_std": 0.06744526326656342, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4140625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1006.0, "completions/mean_length": 627.203125, "completions/mean_terminated_length": 346.8000183105469, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 1.7893333333333334, "grad_norm": 0.3955593725242068, "kl": 0.05096435546875, "learning_rate": 1.4804225250339281e-08, "loss": 0.0386, "num_tokens": 47073651.0, "reward": 0.28125, "reward_std": 0.04929219186306, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 602.8359375, "completions/mean_terminated_length": 341.60760498046875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 1.7928888888888888, "grad_norm": 0.2739606672132355, "kl": 0.05059814453125, "learning_rate": 1.4319590347592254e-08, "loss": 0.0239, "num_tokens": 47170214.0, "reward": 0.30078125, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 534.6328125, "completions/mean_terminated_length": 343.14129638671875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.7964444444444445, "grad_norm": 0.5136022239517629, "kl": 0.056610107421875, "learning_rate": 1.3842786657596446e-08, "loss": 0.1087, "num_tokens": 47257995.0, "reward": 0.35546875, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 559.1015625, "completions/mean_terminated_length": 347.7840881347656, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 1.8, "grad_norm": 0.4373449634777124, "kl": 0.053314208984375, "learning_rate": 1.3373830023633597e-08, "loss": 0.0671, "num_tokens": 47348940.0, "reward": 0.3359375, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 945.0, "completions/mean_length": 540.0625, "completions/mean_terminated_length": 328.0, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.8035555555555556, "grad_norm": 0.2824473656048655, "kl": 0.05169677734375, "learning_rate": 1.2912736028242777e-08, "loss": 0.0424, "num_tokens": 47437444.0, "reward": 0.33984375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.234375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 523.2734375, "completions/mean_terminated_length": 369.9897766113281, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 1.8071111111111111, "grad_norm": 0.3418346617012243, "kl": 0.0579833984375, "learning_rate": 1.2459519992702311e-08, "loss": 0.0217, "num_tokens": 47523863.0, "reward": 0.375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4375, "completions/max_length": 1024.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 658.5078125, "completions/mean_terminated_length": 374.2361145019531, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.8106666666666666, "grad_norm": 0.39219547756906314, "kl": 0.0498046875, "learning_rate": 1.2014196976521035e-08, "loss": 0.0737, "num_tokens": 47627568.0, "reward": 0.26953125, "reward_std": 0.057104695588350296, "rewards/equation_reward_func/mean": 0.5390625, "rewards/equation_reward_func/std": 0.5004304051399231, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 502.234375, "completions/mean_terminated_length": 349.3939514160156, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.8142222222222222, "grad_norm": 0.4021581606856545, "kl": 0.056671142578125, "learning_rate": 1.1576781776937634e-08, "loss": 0.0267, "num_tokens": 47711230.0, "reward": 0.36328125, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 510.9921875, "completions/mean_terminated_length": 339.9895935058594, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.8177777777777777, "grad_norm": 0.553735588423851, "kl": 0.053253173828125, "learning_rate": 1.1147288928429116e-08, "loss": 0.0687, "num_tokens": 47795985.0, "reward": 0.3828125, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.75, "rewards/equation_reward_func/std": 0.434714138507843, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 618.5078125, "completions/mean_terminated_length": 367.0, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 1.8213333333333335, "grad_norm": 0.2547502317957006, "kl": 0.05340576171875, "learning_rate": 1.0725732702227735e-08, "loss": 0.024, "num_tokens": 47894630.0, "reward": 0.30859375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 568.859375, "completions/mean_terminated_length": 369.4157409667969, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.8248888888888888, "grad_norm": 0.4960338161580114, "kl": 0.055694580078125, "learning_rate": 1.0312127105846947e-08, "loss": 0.1012, "num_tokens": 47986880.0, "reward": 0.33984375, "reward_std": 0.08835469186306, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.34375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 596.6640625, "completions/mean_terminated_length": 372.8214416503906, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 1.8284444444444445, "grad_norm": 0.382027963322798, "kl": 0.052764892578125, "learning_rate": 9.906485882615695e-09, "loss": 0.0608, "num_tokens": 48082637.0, "reward": 0.328125, "reward_std": 0.05589609593153, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 625.265625, "completions/mean_terminated_length": 377.9493713378906, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 1.8319999999999999, "grad_norm": 0.381532972124932, "kl": 0.05010986328125, "learning_rate": 9.50882251122212e-09, "loss": 0.0598, "num_tokens": 48182139.0, "reward": 0.30078125, "reward_std": 0.10397969186306, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 974.0, "completions/mean_length": 513.3359375, "completions/mean_terminated_length": 350.1340026855469, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 1.8355555555555556, "grad_norm": 0.42444958369066205, "kl": 0.0545654296875, "learning_rate": 9.119150205265324e-09, "loss": 0.0427, "num_tokens": 48267194.0, "reward": 0.37109375, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.265625, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 566.1640625, "completions/mean_terminated_length": 400.5638122558594, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.8391111111111111, "grad_norm": 0.45647319227740496, "kl": 0.054229736328125, "learning_rate": 8.737481912816592e-09, "loss": 0.083, "num_tokens": 48359135.0, "reward": 0.37109375, "reward_std": 0.07646635919809341, "rewards/equation_reward_func/mean": 0.7265625, "rewards/equation_reward_func/std": 0.447474867105484, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.40625, "completions/max_length": 1024.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 621.859375, "completions/mean_terminated_length": 346.7105407714844, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 1.8426666666666667, "grad_norm": 0.43538432713032377, "kl": 0.050048828125, "learning_rate": 8.363830315988945e-09, "loss": 0.0676, "num_tokens": 48458137.0, "reward": 0.2890625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.5703125, "rewards/equation_reward_func/std": 0.4969765841960907, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 531.5859375, "completions/mean_terminated_length": 307.7613830566406, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.8462222222222222, "grad_norm": 0.49444611438833674, "kl": 0.052520751953125, "learning_rate": 7.99820783051583e-09, "loss": 0.1031, "num_tokens": 48545568.0, "reward": 0.34375, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 532.671875, "completions/mean_terminated_length": 340.4130554199219, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 1.8497777777777777, "grad_norm": 0.6028126134834118, "kl": 0.0516357421875, "learning_rate": 7.640626605338624e-09, "loss": 0.0244, "num_tokens": 48633162.0, "reward": 0.359375, "reward_std": 0.0739382952451706, "rewards/equation_reward_func/mean": 0.703125, "rewards/equation_reward_func/std": 0.45867621898651123, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 615.6171875, "completions/mean_terminated_length": 370.5874938964844, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 1.8533333333333335, "grad_norm": 0.3798166678202122, "kl": 0.049896240234375, "learning_rate": 7.291098522202776e-09, "loss": 0.0856, "num_tokens": 48731381.0, "reward": 0.3125, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 847.0, "completions/mean_length": 578.796875, "completions/mean_terminated_length": 311.6750183105469, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 1.8568888888888888, "grad_norm": 0.39167392788845884, "kl": 0.052398681640625, "learning_rate": 6.949635195263259e-09, "loss": 0.0561, "num_tokens": 48824803.0, "reward": 0.31640625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 982.0, "completions/mean_length": 607.4921875, "completions/mean_terminated_length": 418.17047119140625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 1.8604444444444446, "grad_norm": 0.4150767438958431, "kl": 0.049560546875, "learning_rate": 6.616247970698319e-09, "loss": 0.0731, "num_tokens": 48921974.0, "reward": 0.33984375, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 984.0, "completions/mean_length": 632.78125, "completions/mean_terminated_length": 373.6623229980469, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 1.8639999999999999, "grad_norm": 0.36105352200021407, "kl": 0.055908203125, "learning_rate": 6.290947926332835e-09, "loss": 0.0507, "num_tokens": 49022410.0, "reward": 0.3046875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 628.9375, "completions/mean_terminated_length": 407.3170471191406, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 1.8675555555555556, "grad_norm": 0.4565367882168917, "kl": 0.053741455078125, "learning_rate": 5.97374587126992e-09, "loss": 0.0384, "num_tokens": 49122322.0, "reward": 0.328125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 998.0, "completions/mean_length": 614.5390625, "completions/mean_terminated_length": 384.8414611816406, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 1.871111111111111, "grad_norm": 0.5295177215612841, "kl": 0.05120849609375, "learning_rate": 5.664652345531845e-09, "loss": 0.0941, "num_tokens": 49220423.0, "reward": 0.296875, "reward_std": 0.07206448912620544, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3125, "completions/max_length": 1024.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 578.3828125, "completions/mean_terminated_length": 375.8295593261719, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 1.8746666666666667, "grad_norm": 0.3916469984098702, "kl": 0.053741455078125, "learning_rate": 5.363677619709933e-09, "loss": 0.0744, "num_tokens": 49313812.0, "reward": 0.3515625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.390625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 649.28125, "completions/mean_terminated_length": 409.0769348144531, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 1.8782222222222222, "grad_norm": 0.25806616303229807, "kl": 0.0523681640625, "learning_rate": 5.070831694623135e-09, "loss": 0.0302, "num_tokens": 49416376.0, "reward": 0.30859375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 524.3515625, "completions/mean_terminated_length": 350.78948974609375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 1.8817777777777778, "grad_norm": 0.47297045354993056, "kl": 0.053863525390625, "learning_rate": 4.786124300985822e-09, "loss": 0.1178, "num_tokens": 49502897.0, "reward": 0.36328125, "reward_std": 0.07272969186306, "rewards/equation_reward_func/mean": 0.71875, "rewards/equation_reward_func/std": 0.4513758420944214, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 868.0, "completions/mean_length": 590.84375, "completions/mean_terminated_length": 371.7176513671875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 1.8853333333333333, "grad_norm": 0.5675269326054602, "kl": 0.05419921875, "learning_rate": 4.509564899084328e-09, "loss": 0.0788, "num_tokens": 49597929.0, "reward": 0.3203125, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 567.859375, "completions/mean_terminated_length": 345.093017578125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 1.8888888888888888, "grad_norm": 0.45658089174297584, "kl": 0.057708740234375, "learning_rate": 4.241162678462806e-09, "loss": 0.0634, "num_tokens": 49690003.0, "reward": 0.3359375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 512.2734375, "completions/mean_terminated_length": 341.69793701171875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 1.8924444444444446, "grad_norm": 0.4953474998748152, "kl": 0.058685302734375, "learning_rate": 3.9809265576176146e-09, "loss": 0.0634, "num_tokens": 49774934.0, "reward": 0.375, "reward_std": 0.07152109593153, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3828125, "completions/max_length": 1024.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 597.6484375, "completions/mean_terminated_length": 333.2025451660156, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 1.896, "grad_norm": 0.30647515182947954, "kl": 0.0543212890625, "learning_rate": 3.7288651837012745e-09, "loss": 0.0266, "num_tokens": 49870845.0, "reward": 0.30859375, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 637.640625, "completions/mean_terminated_length": 381.7402648925781, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 1.8995555555555557, "grad_norm": 0.35163577505956356, "kl": 0.05169677734375, "learning_rate": 3.4849869322348126e-09, "loss": 0.0611, "num_tokens": 49971911.0, "reward": 0.296875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.59375, "rewards/equation_reward_func/std": 0.4930621087551117, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3671875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 595.828125, "completions/mean_terminated_length": 347.3827209472656, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.903111111111111, "grad_norm": 0.3111593633155223, "kl": 0.054534912109375, "learning_rate": 3.249299906829761e-09, "loss": 0.0272, "num_tokens": 50067609.0, "reward": 0.30078125, "reward_std": 0.03245859593153, "rewards/equation_reward_func/mean": 0.6015625, "rewards/equation_reward_func/std": 0.4915000796318054, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2578125, "completions/max_length": 1024.0, "completions/max_terminated_length": 965.0, "completions/mean_length": 523.9296875, "completions/mean_terminated_length": 350.2210693359375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 1.9066666666666667, "grad_norm": 0.5520115118264982, "kl": 0.0555419921875, "learning_rate": 3.0218119389186502e-09, "loss": 0.1127, "num_tokens": 50154052.0, "reward": 0.375, "reward_std": 0.09912778437137604, "rewards/equation_reward_func/mean": 0.734375, "rewards/equation_reward_func/std": 0.44340085983276367, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 609.9140625, "completions/mean_terminated_length": 335.64935302734375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.9102222222222223, "grad_norm": 0.4801949131639632, "kl": 0.054931640625, "learning_rate": 2.8025305874949945e-09, "loss": 0.0821, "num_tokens": 50251557.0, "reward": 0.3046875, "reward_std": 0.09439767897129059, "rewards/equation_reward_func/mean": 0.5859375, "rewards/equation_reward_func/std": 0.49449479579925537, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4453125, "completions/max_length": 1024.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 667.4921875, "completions/mean_terminated_length": 381.28167724609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 1.9137777777777778, "grad_norm": 0.42612610203096174, "kl": 0.04840087890625, "learning_rate": 2.5914631388619103e-09, "loss": 0.0628, "num_tokens": 50356488.0, "reward": 0.265625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.53125, "rewards/equation_reward_func/std": 0.5009832978248596, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2890625, "completions/max_length": 1024.0, "completions/max_terminated_length": 1017.0, "completions/mean_length": 530.546875, "completions/mean_terminated_length": 329.912109375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 1.9173333333333333, "grad_norm": 0.4708232230872337, "kl": 0.05560302734375, "learning_rate": 2.388616606390198e-09, "loss": 0.058, "num_tokens": 50443766.0, "reward": 0.3515625, "reward_std": 0.07767495512962341, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 574.609375, "completions/mean_terminated_length": 355.1395263671875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 1.9208888888888889, "grad_norm": 0.48940225962215, "kl": 0.053619384765625, "learning_rate": 2.193997730285141e-09, "loss": 0.1203, "num_tokens": 50536800.0, "reward": 0.3359375, "reward_std": 0.09858438372612, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 549.828125, "completions/mean_terminated_length": 342.0449523925781, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 1.9244444444444444, "grad_norm": 0.4818281452077839, "kl": 0.05609130859375, "learning_rate": 2.0076129773627103e-09, "loss": 0.0888, "num_tokens": 50626586.0, "reward": 0.34765625, "reward_std": 0.06370859593153, "rewards/equation_reward_func/mean": 0.6875, "rewards/equation_reward_func/std": 0.4653336703777313, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.46875, "completions/max_length": 1024.0, "completions/max_terminated_length": 992.0, "completions/mean_length": 649.671875, "completions/mean_terminated_length": 319.3823547363281, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 1.928, "grad_norm": 0.5001563666902327, "kl": 0.048675537109375, "learning_rate": 1.8294685408345167e-09, "loss": 0.0914, "num_tokens": 50729164.0, "reward": 0.2578125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.515625, "rewards/equation_reward_func/std": 0.5017194747924805, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 878.0, "completions/mean_length": 605.2578125, "completions/mean_terminated_length": 378.2289123535156, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 1.9315555555555557, "grad_norm": 0.3607527633449463, "kl": 0.051025390625, "learning_rate": 1.6595703401020844e-09, "loss": 0.0582, "num_tokens": 50826037.0, "reward": 0.31640625, "reward_std": 0.060841359198093414, "rewards/equation_reward_func/mean": 0.625, "rewards/equation_reward_func/std": 0.4860251843929291, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 976.0, "completions/mean_length": 621.125, "completions/mean_terminated_length": 402.69879150390625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 1.935111111111111, "grad_norm": 0.49396643714496463, "kl": 0.053375244140625, "learning_rate": 1.497924020560204e-09, "loss": 0.0621, "num_tokens": 50924941.0, "reward": 0.31640625, "reward_std": 0.08889809250831604, "rewards/equation_reward_func/mean": 0.6171875, "rewards/equation_reward_func/std": 0.4879830479621887, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.28125, "completions/max_length": 1024.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 539.1171875, "completions/mean_terminated_length": 349.38043212890625, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 1.9386666666666668, "grad_norm": 0.4231547961400024, "kl": 0.052154541015625, "learning_rate": 1.3445349534093598e-09, "loss": 0.089, "num_tokens": 51013304.0, "reward": 0.359375, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.7109375, "rewards/equation_reward_func/std": 0.45510825514793396, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2734375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1002.0, "completions/mean_length": 534.078125, "completions/mean_terminated_length": 349.69891357421875, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 1.942222222222222, "grad_norm": 0.5163762510832167, "kl": 0.05560302734375, "learning_rate": 1.199408235477123e-09, "loss": 0.0752, "num_tokens": 51101062.0, "reward": 0.359375, "reward_std": 0.08295938372612, "rewards/equation_reward_func/mean": 0.6953125, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3203125, "completions/max_length": 1024.0, "completions/max_terminated_length": 954.0, "completions/mean_length": 573.703125, "completions/mean_terminated_length": 361.4942626953125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.9457777777777778, "grad_norm": 0.35284950428929734, "kl": 0.05389404296875, "learning_rate": 1.0625486890488978e-09, "loss": 0.0578, "num_tokens": 51193904.0, "reward": 0.34765625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.4765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 679.03125, "completions/mean_terminated_length": 364.9552001953125, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 1.9493333333333334, "grad_norm": 0.3920190719921376, "kl": 0.0496826171875, "learning_rate": 9.339608617077165e-10, "loss": 0.0599, "num_tokens": 51300288.0, "reward": 0.26171875, "reward_std": 0.0661257952451706, "rewards/equation_reward_func/mean": 0.5234375, "rewards/equation_reward_func/std": 0.5014128684997559, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3046875, "completions/max_length": 1024.0, "completions/max_terminated_length": 892.0, "completions/mean_length": 562.109375, "completions/mean_terminated_length": 359.7078552246094, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.952888888888889, "grad_norm": 0.4403364111370624, "kl": 0.054656982421875, "learning_rate": 8.136490261830553e-10, "loss": 0.0523, "num_tokens": 51391634.0, "reward": 0.34375, "reward_std": 0.06964729726314545, "rewards/equation_reward_func/mean": 0.671875, "rewards/equation_reward_func/std": 0.4713755249977112, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2109375, "completions/max_length": 1024.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 489.6875, "completions/mean_terminated_length": 346.8514709472656, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.9564444444444444, "grad_norm": 0.5470904066225087, "kl": 0.056732177734375, "learning_rate": 7.016171802088633e-10, "loss": 0.0768, "num_tokens": 51473702.0, "reward": 0.40625, "reward_std": 0.07239051908254623, "rewards/equation_reward_func/mean": 0.78125, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 587.671875, "completions/mean_terminated_length": 342.9024353027344, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 1.96, "grad_norm": 0.5029060333619112, "kl": 0.057769775390625, "learning_rate": 5.978690463908087e-10, "loss": 0.0765, "num_tokens": 51568392.0, "reward": 0.31640625, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3359375, "completions/max_length": 1024.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 591.59375, "completions/mean_terminated_length": 372.8470764160156, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.9635555555555557, "grad_norm": 0.436921808105237, "kl": 0.052886962890625, "learning_rate": 5.024080720824608e-10, "loss": 0.0341, "num_tokens": 51663544.0, "reward": 0.33203125, "reward_std": 0.060841359198093414, "rewards/equation_reward_func/mean": 0.6484375, "rewards/equation_reward_func/std": 0.4793342351913452, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 525.0546875, "completions/mean_terminated_length": 314.3888854980469, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 1.967111111111111, "grad_norm": 0.5465456147130165, "kl": 0.056243896484375, "learning_rate": 4.152374292708538e-10, "loss": 0.0758, "num_tokens": 51750119.0, "reward": 0.34375, "reward_std": 0.08714609593153, "rewards/equation_reward_func/mean": 0.65625, "rewards/equation_reward_func/std": 0.47682511806488037, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.296875, "completions/max_length": 1024.0, "completions/max_terminated_length": 949.0, "completions/mean_length": 556.5859375, "completions/mean_terminated_length": 359.23333740234375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 1.9706666666666668, "grad_norm": 0.36448485347283277, "kl": 0.0552978515625, "learning_rate": 3.363600144710155e-10, "loss": 0.0551, "num_tokens": 51840782.0, "reward": 0.34375, "reward_std": 0.04027109593153, "rewards/equation_reward_func/mean": 0.6796875, "rewards/equation_reward_func/std": 0.4684300124645233, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.375, "completions/max_length": 1024.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 555.171875, "completions/mean_terminated_length": 273.875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 1.974222222222222, "grad_norm": 0.33716398784456447, "kl": 0.053375244140625, "learning_rate": 2.6577844862973877e-10, "loss": 0.0688, "num_tokens": 51931224.0, "reward": 0.30859375, "reward_std": 0.04808359593153, "rewards/equation_reward_func/mean": 0.609375, "rewards/equation_reward_func/std": 0.4898075461387634, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 597.1875, "completions/mean_terminated_length": 365.7831115722656, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 1.9777777777777779, "grad_norm": 0.38726327782326486, "kl": 0.055511474609375, "learning_rate": 2.0349507703851243e-10, "loss": 0.0481, "num_tokens": 52027128.0, "reward": 0.32421875, "reward_std": 0.0546875, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.3515625, "completions/max_length": 1024.0, "completions/max_terminated_length": 930.0, "completions/mean_length": 578.0078125, "completions/mean_terminated_length": 336.2048034667969, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 1.9813333333333332, "grad_norm": 0.5532498505226859, "kl": 0.06024169921875, "learning_rate": 1.4951196925561127e-10, "loss": 0.1149, "num_tokens": 52120541.0, "reward": 0.3203125, "reward_std": 0.0895632952451706, "rewards/equation_reward_func/mean": 0.6328125, "rewards/equation_reward_func/std": 0.4839322865009308, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 988.0, "completions/mean_length": 582.34375, "completions/mean_terminated_length": 366.6511535644531, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.984888888888889, "grad_norm": 0.5415196262432431, "kl": 0.05242919921875, "learning_rate": 1.0383091903720665e-10, "loss": 0.0849, "num_tokens": 52214393.0, "reward": 0.33984375, "reward_std": 0.11067695170640945, "rewards/equation_reward_func/mean": 0.640625, "rewards/equation_reward_func/std": 0.481702595949173, "rewards/format_reward_func/mean": 0.0390625, "rewards/format_reward_func/std": 0.194504976272583, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.328125, "completions/max_length": 1024.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 596.296875, "completions/mean_terminated_length": 387.4186096191406, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 1.9884444444444445, "grad_norm": 0.4731136889768231, "kl": 0.053436279296875, "learning_rate": 6.645344427794186e-11, "loss": 0.0672, "num_tokens": 52310123.0, "reward": 0.34375, "reward_std": 0.09416937828063965, "rewards/equation_reward_func/mean": 0.6640625, "rewards/equation_reward_func/std": 0.47417303919792175, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.2421875, "completions/max_length": 1024.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 487.3515625, "completions/mean_terminated_length": 315.8453369140625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 1.992, "grad_norm": 0.1394105889802075, "kl": 0.057708740234375, "learning_rate": 3.738078696036151e-11, "loss": 0.0042, "num_tokens": 52391788.0, "reward": 0.37890625, "reward_std": 0.0078125, "rewards/equation_reward_func/mean": 0.7578125, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.25, "completions/max_length": 1024.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 509.703125, "completions/mean_terminated_length": 338.2708435058594, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 1.9955555555555555, "grad_norm": 0.4418318818416417, "kl": 0.0537109375, "learning_rate": 1.6613913113694423e-11, "loss": 0.0734, "num_tokens": 52476418.0, "reward": 0.375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.7421875, "rewards/equation_reward_func/std": 0.43914902210235596, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.42500000000000004, "completions/max_length": 1024.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 647.4000244140625, "completions/mean_terminated_length": 369.0434875488281, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 1.999111111111111, "grad_norm": 0.47313817649981726, "kl": 0.05224609375, "learning_rate": 4.153512781768231e-12, "loss": 0.1221, "num_tokens": 52579693.0, "reward": 0.28125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.5625, "rewards/equation_reward_func/std": 0.49802759289741516, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 562 }, { "epoch": 1.999111111111111, "step": 562, "total_flos": 0.0, "train_loss": 0.06262669591581137, "train_runtime": 18440.1211, "train_samples_per_second": 0.976, "train_steps_per_second": 0.03 } ], "logging_steps": 1, "max_steps": 562, "num_input_tokens_seen": 52579693, "num_train_epochs": 2, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }