{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 1.999111111111111, "eval_steps": 500, "global_step": 562, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 1021.0, "completions/mean_length": 1023.9765625, "completions/mean_terminated_length": 1021.0, "completions/min_length": 1021.0, "completions/min_terminated_length": 1021.0, "epoch": 0.0035555555555555557, "grad_norm": 0.4749051611172313, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0, "num_tokens": 149985.0, "reward": 0.0234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.0071111111111111115, "grad_norm": 0.26760539189603993, "kl": 0.0, "learning_rate": 2.941176470588235e-08, "loss": -0.0, "num_tokens": 300121.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.010666666666666666, "grad_norm": 0.0028060903814212013, "kl": 0.0013895034790039062, "learning_rate": 5.88235294117647e-08, "loss": 0.0, "num_tokens": 450185.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.014222222222222223, "grad_norm": 0.6812448616547815, "kl": 0.0010929107666015625, "learning_rate": 8.823529411764706e-08, "loss": 0.0, "num_tokens": 600217.0, "reward": 0.0546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.017777777777777778, "grad_norm": 0.6435510901521116, "kl": 0.0011014938354492188, "learning_rate": 1.176470588235294e-07, "loss": 0.0, "num_tokens": 750341.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.021333333333333333, "grad_norm": 0.6266308317154681, "kl": 0.0011110305786132812, "learning_rate": 1.4705882352941175e-07, "loss": 0.0, "num_tokens": 900441.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.024888888888888887, "grad_norm": 0.25625385514157745, "kl": 0.0012025833129882812, "learning_rate": 1.764705882352941e-07, "loss": 0.0, "num_tokens": 1050481.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 985.0, "completions/mean_length": 1023.6953125, "completions/mean_terminated_length": 985.0, "completions/min_length": 985.0, "completions/min_terminated_length": 985.0, "epoch": 0.028444444444444446, "grad_norm": 0.35113969256431987, "kl": 0.0011568069458007812, "learning_rate": 2.0588235294117645e-07, "loss": 0.0, "num_tokens": 1200574.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.032, "grad_norm": 0.3067957272358877, "kl": 0.0014123916625976562, "learning_rate": 2.352941176470588e-07, "loss": 0.0, "num_tokens": 1350650.0, "reward": 0.015625, "reward_std": 0.018042195588350296, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.035555555555555556, "grad_norm": 0.46766892530688997, "kl": 0.0012612342834472656, "learning_rate": 2.6470588235294114e-07, "loss": 0.0, "num_tokens": 1500814.0, "reward": 0.0234375, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.03911111111111111, "grad_norm": 0.011064133025813657, "kl": 0.0016937255859375, "learning_rate": 2.941176470588235e-07, "loss": 0.0, "num_tokens": 1650986.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.042666666666666665, "grad_norm": 0.8864693490639021, "kl": 0.0014581680297851562, "learning_rate": 3.2352941176470586e-07, "loss": 0.0, "num_tokens": 1801054.0, "reward": 0.046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 571.0, "completions/mean_length": 1020.4609375, "completions/mean_terminated_length": 571.0, "completions/min_length": 571.0, "completions/min_terminated_length": 571.0, "epoch": 0.04622222222222222, "grad_norm": 0.27411469320570264, "kl": 0.0012121200561523438, "learning_rate": 3.529411764705882e-07, "loss": 0.0, "num_tokens": 1950637.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.049777777777777775, "grad_norm": 0.6114293143562881, "kl": 0.0014543533325195312, "learning_rate": 3.8235294117647053e-07, "loss": 0.0, "num_tokens": 2100741.0, "reward": 0.03125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 761.0, "completions/mean_length": 1016.5390625, "completions/mean_terminated_length": 546.5, "completions/min_length": 332.0, "completions/min_terminated_length": 332.0, "epoch": 0.05333333333333334, "grad_norm": 0.46240265464867203, "kl": 0.0015316009521484375, "learning_rate": 4.117647058823529e-07, "loss": -0.0029, "num_tokens": 2249850.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.05688888888888889, "grad_norm": 0.007644259978373461, "kl": 0.0020570755004882812, "learning_rate": 4.4117647058823526e-07, "loss": 0.0, "num_tokens": 2400006.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 352.0, "completions/mean_length": 1018.75, "completions/mean_terminated_length": 352.0, "completions/min_length": 352.0, "completions/min_terminated_length": 352.0, "epoch": 0.060444444444444446, "grad_norm": 0.8813471531622719, "kl": 0.0014600753784179688, "learning_rate": 4.705882352941176e-07, "loss": 0.0, "num_tokens": 2549386.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.064, "grad_norm": 0.9166755928687534, "kl": 0.004370689392089844, "learning_rate": 5e-07, "loss": 0.0, "num_tokens": 2699362.0, "reward": 0.0390625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.06755555555555555, "grad_norm": 0.01611198673776164, "kl": 0.0020809173583984375, "learning_rate": 4.999958464872182e-07, "loss": 0.0, "num_tokens": 2849478.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07111111111111111, "grad_norm": 0.3398094055120978, "kl": 0.0019540786743164062, "learning_rate": 4.999833860868863e-07, "loss": 0.0, "num_tokens": 2999594.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.07466666666666667, "grad_norm": 0.5848609850333812, "kl": 0.0024175643920898438, "learning_rate": 4.999626192130396e-07, "loss": 0.0, "num_tokens": 3149718.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 417.0, "completions/mean_length": 1019.2578125, "completions/mean_terminated_length": 417.0, "completions/min_length": 417.0, "completions/min_terminated_length": 417.0, "epoch": 0.07822222222222222, "grad_norm": 0.5211934932084907, "kl": 0.0025272369384765625, "learning_rate": 4.99933546555722e-07, "loss": 0.0, "num_tokens": 3299267.0, "reward": 0.0390625, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 537.0, "completions/mean_length": 1015.4140625, "completions/mean_terminated_length": 474.5, "completions/min_length": 412.0, "completions/min_terminated_length": 412.0, "epoch": 0.08177777777777778, "grad_norm": 0.6156765403685096, "kl": 0.0026531219482421875, "learning_rate": 4.998961690809627e-07, "loss": 0.0, "num_tokens": 3448264.0, "reward": 0.0234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.08533333333333333, "grad_norm": 0.005711216753229753, "kl": 0.003314971923828125, "learning_rate": 4.998504880307444e-07, "loss": 0.0, "num_tokens": 3598432.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.08888888888888889, "grad_norm": 0.30434179033180503, "kl": 0.005374908447265625, "learning_rate": 4.997965049229614e-07, "loss": 0.0, "num_tokens": 3748472.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 1022.2890625, "completions/mean_terminated_length": 805.0, "completions/min_length": 805.0, "completions/min_terminated_length": 805.0, "epoch": 0.09244444444444444, "grad_norm": 0.5767377728942019, "kl": 0.0041370391845703125, "learning_rate": 4.997342215513703e-07, "loss": 0.0, "num_tokens": 3898337.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 956.0, "completions/mean_length": 1022.78125, "completions/mean_terminated_length": 946.0, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.096, "grad_norm": 0.41791059441069284, "kl": 0.0041866302490234375, "learning_rate": 4.99663639985529e-07, "loss": 0.0, "num_tokens": 4048269.0, "reward": 0.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.09955555555555555, "grad_norm": 0.6692921505250378, "kl": 0.008413314819335938, "learning_rate": 4.995847625707292e-07, "loss": 0.0, "num_tokens": 4198269.0, "reward": 0.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.10311111111111111, "grad_norm": 0.4021117685162764, "kl": 0.0123748779296875, "learning_rate": 4.994975919279175e-07, "loss": 0.0, "num_tokens": 4348409.0, "reward": 0.0234375, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 293.0, "completions/mean_length": 1018.2890625, "completions/mean_terminated_length": 293.0, "completions/min_length": 293.0, "completions/min_terminated_length": 293.0, "epoch": 0.10666666666666667, "grad_norm": 0.5008928905800116, "kl": 0.006610870361328125, "learning_rate": 4.994021309536092e-07, "loss": 0.0, "num_tokens": 4497718.0, "reward": 0.0234375, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 1000.0, "completions/mean_length": 1022.7890625, "completions/mean_terminated_length": 946.5, "completions/min_length": 893.0, "completions/min_terminated_length": 893.0, "epoch": 0.11022222222222222, "grad_norm": 0.006481665460282133, "kl": 0.0063934326171875, "learning_rate": 4.992983828197911e-07, "loss": 0.0, "num_tokens": 4647667.0, "reward": 0.0, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.11377777777777778, "grad_norm": 0.28823496155399947, "kl": 0.008678436279296875, "learning_rate": 4.991863509738169e-07, "loss": 0.0, "num_tokens": 4797747.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.11733333333333333, "grad_norm": 0.5803719065604845, "kl": 0.007968902587890625, "learning_rate": 4.990660391382923e-07, "loss": 0.0, "num_tokens": 4947895.0, "reward": 0.0234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 1022.9609375, "completions/mean_terminated_length": 891.0, "completions/min_length": 891.0, "completions/min_terminated_length": 891.0, "epoch": 0.12088888888888889, "grad_norm": 0.29484784007437265, "kl": 0.0101470947265625, "learning_rate": 4.989374513109511e-07, "loss": 0.0, "num_tokens": 5097898.0, "reward": 0.0078125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 1023.53125, "completions/mean_terminated_length": 964.0, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.12444444444444444, "grad_norm": 0.6160459637069239, "kl": 0.014614105224609375, "learning_rate": 4.988005917645229e-07, "loss": 0.0, "num_tokens": 5247898.0, "reward": 0.03125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0, "rewards/format_reward_func/std": 0.0, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 1023.5859375, "completions/mean_terminated_length": 971.0, "completions/min_length": 971.0, "completions/min_terminated_length": 971.0, "epoch": 0.128, "grad_norm": 0.9301764376501208, "kl": 0.011962890625, "learning_rate": 4.986554650465906e-07, "loss": 0.0, "num_tokens": 5397937.0, "reward": 0.0546875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 1023.6640625, "completions/mean_terminated_length": 981.0, "completions/min_length": 981.0, "completions/min_terminated_length": 981.0, "epoch": 0.13155555555555556, "grad_norm": 0.69115619959031, "kl": 0.01397705078125, "learning_rate": 4.985020759794397e-07, "loss": 0.0, "num_tokens": 5548058.0, "reward": 0.0390625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.015625, "rewards/format_reward_func/std": 0.12450689822435379, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.1351111111111111, "grad_norm": 0.6884936118028657, "kl": 0.017425537109375, "learning_rate": 4.983404296598978e-07, "loss": 0.0, "num_tokens": 5698150.0, "reward": 0.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.0078125, "rewards/format_reward_func/std": 0.0883883461356163, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.13866666666666666, "grad_norm": 0.5419303034506929, "kl": 0.01865386962890625, "learning_rate": 4.981705314591655e-07, "loss": 0.0, "num_tokens": 5848274.0, "reward": 0.0546875, "reward_std": 0.08295938372612, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.0234375, "rewards/format_reward_func/std": 0.15188287198543549, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 609.0, "completions/mean_length": 1020.7578125, "completions/mean_terminated_length": 609.0, "completions/min_length": 609.0, "completions/min_terminated_length": 609.0, "epoch": 0.14222222222222222, "grad_norm": 0.8101037181683396, "kl": 0.034332275390625, "learning_rate": 4.979923870226372e-07, "loss": 0.0, "num_tokens": 5997935.0, "reward": 0.0859375, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.0390625, "rewards/format_reward_func/std": 0.194504976272583, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.14577777777777778, "grad_norm": 0.5651823505693436, "kl": 0.03551483154296875, "learning_rate": 4.978060022697148e-07, "loss": 0.0, "num_tokens": 6147995.0, "reward": 0.0703125, "reward_std": 0.1236695945262909, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.03125, "rewards/format_reward_func/std": 0.1746762990951538, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 585.0, "completions/mean_length": 1020.5703125, "completions/mean_terminated_length": 585.0, "completions/min_length": 585.0, "completions/min_terminated_length": 585.0, "epoch": 0.14933333333333335, "grad_norm": 0.6229317404966861, "kl": 0.02642059326171875, "learning_rate": 4.976113833936098e-07, "loss": -0.0017, "num_tokens": 6297660.0, "reward": 0.1015625, "reward_std": 0.1285039782524109, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.0390625, "rewards/format_reward_func/std": 0.194504976272583, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15288888888888888, "grad_norm": 3.1850672412293557, "kl": 0.2174072265625, "learning_rate": 4.974085368611381e-07, "loss": 0.0002, "num_tokens": 6447644.0, "reward": 0.09375, "reward_std": 0.1478765904903412, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.0390625, "rewards/format_reward_func/std": 0.194504976272583, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.15644444444444444, "grad_norm": 0.9458304222963471, "kl": 0.0660247802734375, "learning_rate": 4.971974694125051e-07, "loss": 0.0001, "num_tokens": 6597752.0, "reward": 0.125, "reward_std": 0.19980770349502563, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.046875, "rewards/format_reward_func/std": 0.21220162510871887, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16, "grad_norm": 0.8447654866514956, "kl": 0.052886962890625, "learning_rate": 4.969781880610813e-07, "loss": 0.0001, "num_tokens": 6747812.0, "reward": 0.140625, "reward_std": 0.2198367863893509, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.0625, "rewards/format_reward_func/std": 0.24301259219646454, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.16355555555555557, "grad_norm": 0.7460914371686507, "kl": 0.035675048828125, "learning_rate": 4.967507000931702e-07, "loss": 0.0, "num_tokens": 6897904.0, "reward": 0.1328125, "reward_std": 0.18418270349502563, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.0703125, "rewards/format_reward_func/std": 0.2566775679588318, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 594.0, "completions/mean_length": 1020.640625, "completions/mean_terminated_length": 594.0, "completions/min_length": 594.0, "completions/min_terminated_length": 594.0, "epoch": 0.1671111111111111, "grad_norm": 0.8337047179802486, "kl": 0.0596771240234375, "learning_rate": 4.965150130677651e-07, "loss": 0.0018, "num_tokens": 7047614.0, "reward": 0.1484375, "reward_std": 0.1947515904903412, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.09375, "rewards/format_reward_func/std": 0.29262590408325195, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17066666666666666, "grad_norm": 1.2713278800415688, "kl": 0.1058349609375, "learning_rate": 4.962711348162987e-07, "loss": 0.0001, "num_tokens": 7197714.0, "reward": 0.265625, "reward_std": 0.3883945047855377, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.1796875, "rewards/format_reward_func/std": 0.3854354918003082, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17422222222222222, "grad_norm": 1.1831547501575272, "kl": 0.0792236328125, "learning_rate": 4.960190734423824e-07, "loss": 0.0001, "num_tokens": 7347754.0, "reward": 0.359375, "reward_std": 0.41409674286842346, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.2578125, "rewards/format_reward_func/std": 0.43914902210235596, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.17777777777777778, "grad_norm": 3.778212762753793, "kl": 0.265594482421875, "learning_rate": 4.957588373215373e-07, "loss": 0.0003, "num_tokens": 7497846.0, "reward": 0.296875, "reward_std": 0.35075798630714417, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.203125, "rewards/format_reward_func/std": 0.40390563011169434, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18133333333333335, "grad_norm": 1.101547603802584, "kl": 0.134552001953125, "learning_rate": 4.954904351009156e-07, "loss": 0.0001, "num_tokens": 7647946.0, "reward": 0.2734375, "reward_std": 0.3065218925476074, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.1875, "rewards/format_reward_func/std": 0.39184603095054626, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18488888888888888, "grad_norm": 1.5978488825312307, "kl": 0.21014404296875, "learning_rate": 4.952138756990142e-07, "loss": 0.0002, "num_tokens": 7797998.0, "reward": 0.34375, "reward_std": 0.4132579565048218, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.2265625, "rewards/format_reward_func/std": 0.4202519655227661, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.18844444444444444, "grad_norm": 512.5598786984092, "kl": 25.5601806640625, "learning_rate": 4.949291683053768e-07, "loss": 0.0256, "num_tokens": 7948058.0, "reward": 0.2578125, "reward_std": 0.32301196455955505, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.1953125, "rewards/format_reward_func/std": 0.3979988098144531, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 561.0, "completions/mean_length": 1020.3828125, "completions/mean_terminated_length": 561.0, "completions/min_length": 561.0, "completions/min_terminated_length": 561.0, "epoch": 0.192, "grad_norm": 1.5564366489631851, "kl": 0.16339111328125, "learning_rate": 4.946363223802901e-07, "loss": 0.0034, "num_tokens": 8097687.0, "reward": 0.3046875, "reward_std": 0.360648512840271, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.2734375, "rewards/format_reward_func/std": 0.447474867105484, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.19555555555555557, "grad_norm": 1.1305452882497922, "kl": 0.10137939453125, "learning_rate": 4.943353476544681e-07, "loss": 0.0001, "num_tokens": 8247723.0, "reward": 0.3828125, "reward_std": 0.3485274910926819, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.296875, "rewards/format_reward_func/std": 0.45867621898651123, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.1991111111111111, "grad_norm": 4.553888254328864, "kl": 0.7073974609375, "learning_rate": 4.940262541287302e-07, "loss": 0.0007, "num_tokens": 8397903.0, "reward": 0.328125, "reward_std": 0.3895031809806824, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.3046875, "rewards/format_reward_func/std": 0.46208351850509644, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20266666666666666, "grad_norm": 1.3614288344904635, "kl": 0.19384765625, "learning_rate": 4.937090520736671e-07, "loss": 0.0002, "num_tokens": 8547907.0, "reward": 0.3515625, "reward_std": 0.36349600553512573, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.2890625, "rewards/format_reward_func/std": 0.45510825514793396, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 990.0, "completions/mean_length": 1023.734375, "completions/mean_terminated_length": 990.0, "completions/min_length": 990.0, "completions/min_terminated_length": 990.0, "epoch": 0.20622222222222222, "grad_norm": 1.5996692345714885, "kl": 0.4195556640625, "learning_rate": 4.933837520293017e-07, "loss": 0.0004, "num_tokens": 8697945.0, "reward": 0.3984375, "reward_std": 0.4999430775642395, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.3125, "rewards/format_reward_func/std": 0.4653336703777313, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.20977777777777779, "grad_norm": 1.6274567171729826, "kl": 0.189208984375, "learning_rate": 4.930503648047367e-07, "loss": 0.0002, "num_tokens": 8848073.0, "reward": 0.4765625, "reward_std": 0.532931923866272, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.3828125, "rewards/format_reward_func/std": 0.4879830479621887, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21333333333333335, "grad_norm": 2.5750129466221963, "kl": 0.19110107421875, "learning_rate": 4.927089014777972e-07, "loss": 0.0002, "num_tokens": 8998121.0, "reward": 0.65625, "reward_std": 0.4891524910926819, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.5390625, "rewards/format_reward_func/std": 0.5004304051399231, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.21688888888888888, "grad_norm": 13.993163769757592, "kl": 0.8134765625, "learning_rate": 4.923593733946614e-07, "loss": 0.0008, "num_tokens": 9148093.0, "reward": 0.546875, "reward_std": 0.4977344870567322, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.4453125, "rewards/format_reward_func/std": 0.4989531338214874, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22044444444444444, "grad_norm": 28.51452434536603, "kl": 1.03125, "learning_rate": 4.920017921694841e-07, "loss": 0.001, "num_tokens": 9298205.0, "reward": 0.578125, "reward_std": 0.5072165727615356, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.4609375, "rewards/format_reward_func/std": 0.5004304051399231, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 1023.203125, "completions/mean_terminated_length": 922.0, "completions/min_length": 922.0, "completions/min_terminated_length": 922.0, "epoch": 0.224, "grad_norm": 301.45171655027264, "kl": 38.72314453125, "learning_rate": 4.91636169684011e-07, "loss": 0.0388, "num_tokens": 9448223.0, "reward": 0.5625, "reward_std": 0.41805291175842285, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.5, "rewards/format_reward_func/std": 0.5019646286964417, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.22755555555555557, "grad_norm": 83.35859886358769, "kl": 1.23046875, "learning_rate": 4.912625180871833e-07, "loss": 0.0012, "num_tokens": 9598335.0, "reward": 0.546875, "reward_std": 0.4918133020401001, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.4921875, "rewards/format_reward_func/std": 0.5019033551216125, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2311111111111111, "grad_norm": 110.4414998353962, "kl": 8.59423828125, "learning_rate": 4.908808497947346e-07, "loss": 0.0086, "num_tokens": 9748463.0, "reward": 0.6953125, "reward_std": 0.5663774013519287, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.546875, "rewards/format_reward_func/std": 0.4997538626194, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23466666666666666, "grad_norm": 15.464985660073177, "kl": 0.44677734375, "learning_rate": 4.904911774887779e-07, "loss": 0.0004, "num_tokens": 9898559.0, "reward": 0.734375, "reward_std": 0.5164287090301514, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4860251843929291, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.23822222222222222, "grad_norm": 110.62181140841322, "kl": 9.9307861328125, "learning_rate": 4.900935141173842e-07, "loss": 0.01, "num_tokens": 10048631.0, "reward": 0.6171875, "reward_std": 0.5263192653656006, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.5009832978248596, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24177777777777779, "grad_norm": 2.107723454623703, "kl": 0.16064453125, "learning_rate": 4.896878728941531e-07, "loss": 0.0002, "num_tokens": 10198763.0, "reward": 0.71875, "reward_std": 0.5146898627281189, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.546875, "rewards/format_reward_func/std": 0.4997538626194, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 702.0, "completions/mean_length": 1015.9921875, "completions/mean_terminated_length": 511.5, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.24533333333333332, "grad_norm": 2.4276113630388987, "kl": 0.32257080078125, "learning_rate": 4.892742672977722e-07, "loss": 0.001, "num_tokens": 10347810.0, "reward": 0.671875, "reward_std": 0.5014820694923401, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.6015625, "rewards/format_reward_func/std": 0.4915000796318054, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.24888888888888888, "grad_norm": 2.0726095986268063, "kl": 0.3040771484375, "learning_rate": 4.888527110715709e-07, "loss": 0.0003, "num_tokens": 10497858.0, "reward": 0.765625, "reward_std": 0.5184155702590942, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4860251843929291, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25244444444444447, "grad_norm": 3.220272355144728, "kl": 0.979736328125, "learning_rate": 4.884232182230623e-07, "loss": 0.001, "num_tokens": 10647998.0, "reward": 0.671875, "reward_std": 0.4889262914657593, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.5390625, "rewards/format_reward_func/std": 0.5004304051399231, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 620.0, "completions/mean_length": 1020.84375, "completions/mean_terminated_length": 620.0, "completions/min_length": 620.0, "completions/min_terminated_length": 620.0, "epoch": 0.256, "grad_norm": 29081.71436912935, "kl": 2880.66650390625, "learning_rate": 4.879858030234789e-07, "loss": 2.8709, "num_tokens": 10797698.0, "reward": 0.6484375, "reward_std": 0.5133594274520874, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.5625, "rewards/format_reward_func/std": 0.49802759289741516, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.25955555555555554, "grad_norm": 300.35377327998424, "kl": 17.80126953125, "learning_rate": 4.875404800072976e-07, "loss": 0.0178, "num_tokens": 10947770.0, "reward": 0.7734375, "reward_std": 0.5162069797515869, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4860251843929291, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26311111111111113, "grad_norm": 3.032454724701209, "kl": 0.6512451171875, "learning_rate": 4.870872639717572e-07, "loss": 0.0007, "num_tokens": 11097754.0, "reward": 0.71875, "reward_std": 0.46359750628471375, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.6015625, "rewards/format_reward_func/std": 0.4915000796318054, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.26666666666666666, "grad_norm": 1.5656427698506488, "kl": 0.2464599609375, "learning_rate": 4.866261699763664e-07, "loss": 0.0002, "num_tokens": 11247918.0, "reward": 0.625, "reward_std": 0.5384665727615356, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.5009832978248596, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2702222222222222, "grad_norm": 1.9911433794988622, "kl": 0.51171875, "learning_rate": 4.861572133424035e-07, "loss": 0.0005, "num_tokens": 11397994.0, "reward": 0.7890625, "reward_std": 0.5294147729873657, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.6484375, "rewards/format_reward_func/std": 0.4793342351913452, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2737777777777778, "grad_norm": 25.646564316760404, "kl": 2.8963623046875, "learning_rate": 4.856804096524078e-07, "loss": 0.0029, "num_tokens": 11548066.0, "reward": 0.7421875, "reward_std": 0.49640408158302307, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.6171875, "rewards/format_reward_func/std": 0.4879830479621887, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2773333333333333, "grad_norm": 16.886722080699315, "kl": 1.9315185546875, "learning_rate": 4.851957747496606e-07, "loss": 0.0019, "num_tokens": 11698114.0, "reward": 0.859375, "reward_std": 0.4702102243900299, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.6875, "rewards/format_reward_func/std": 0.4653336703777313, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2808888888888889, "grad_norm": 2.867512045394932, "kl": 0.683349609375, "learning_rate": 4.847033247376605e-07, "loss": 0.0007, "num_tokens": 11848234.0, "reward": 0.71875, "reward_std": 0.5516130328178406, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.6015625, "rewards/format_reward_func/std": 0.4915000796318054, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 517.0, "completions/mean_length": 1020.0390625, "completions/mean_terminated_length": 517.0, "completions/min_length": 517.0, "completions/min_terminated_length": 517.0, "epoch": 0.28444444444444444, "grad_norm": 930.7599936877901, "kl": 129.28759765625, "learning_rate": 4.842030759795866e-07, "loss": 0.1244, "num_tokens": 11997699.0, "reward": 0.671875, "reward_std": 0.5193157196044922, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.5546875, "rewards/format_reward_func/std": 0.4989531338214874, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.288, "grad_norm": 9.09872297206347, "kl": 2.72705078125, "learning_rate": 4.836950450977558e-07, "loss": 0.0027, "num_tokens": 12147767.0, "reward": 0.6640625, "reward_std": 0.5751680135726929, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.5546875, "rewards/format_reward_func/std": 0.4989531338214874, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.29155555555555557, "grad_norm": 23.68537689251002, "kl": 2.2000732421875, "learning_rate": 4.831792489730703e-07, "loss": 0.0022, "num_tokens": 12297855.0, "reward": 0.6875, "reward_std": 0.5555868148803711, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.4930621087551117, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.2951111111111111, "grad_norm": 96.77596731759, "kl": 12.6279296875, "learning_rate": 4.826557047444563e-07, "loss": 0.0127, "num_tokens": 12447931.0, "reward": 0.7734375, "reward_std": 0.4865091145038605, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.609375, "rewards/format_reward_func/std": 0.4898075461387634, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 1023.7109375, "completions/mean_terminated_length": 987.0, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.2986666666666667, "grad_norm": 36.29043682822812, "kl": 0.850341796875, "learning_rate": 4.821244298082951e-07, "loss": 0.0012, "num_tokens": 12598042.0, "reward": 0.59375, "reward_std": 0.531623363494873, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.46875, "rewards/format_reward_func/std": 0.5009832978248596, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3022222222222222, "grad_norm": 15.946605518510674, "kl": 0.48291015625, "learning_rate": 4.815854418178445e-07, "loss": 0.0005, "num_tokens": 12748178.0, "reward": 0.6328125, "reward_std": 0.4821094870567322, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.515625, "rewards/format_reward_func/std": 0.5017194747924805, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30577777777777776, "grad_norm": 32.36258194958696, "kl": 3.39697265625, "learning_rate": 4.810387586826527e-07, "loss": 0.0034, "num_tokens": 12898234.0, "reward": 0.796875, "reward_std": 0.45792436599731445, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.65625, "rewards/format_reward_func/std": 0.47682511806488037, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.30933333333333335, "grad_norm": 139.89134570811942, "kl": 5.79541015625, "learning_rate": 4.804843985679626e-07, "loss": 0.0058, "num_tokens": 13048270.0, "reward": 0.640625, "reward_std": 0.4607718586921692, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.5390625, "rewards/format_reward_func/std": 0.5004304051399231, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 1021.375, "completions/mean_terminated_length": 688.0, "completions/min_length": 688.0, "completions/min_terminated_length": 688.0, "epoch": 0.3128888888888889, "grad_norm": 54.67143909719586, "kl": 1.61669921875, "learning_rate": 4.799223798941089e-07, "loss": 0.0016, "num_tokens": 13198062.0, "reward": 0.4296875, "reward_std": 0.42050957679748535, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4860251843929291, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9765625, "completions/max_length": 1024.0, "completions/max_terminated_length": 738.0, "completions/mean_length": 1013.3515625, "completions/mean_terminated_length": 569.6666870117188, "completions/min_length": 458.0, "completions/min_terminated_length": 458.0, "epoch": 0.3164444444444444, "grad_norm": 57.35466003205999, "kl": 0.3779296875, "learning_rate": 4.793527213359058e-07, "loss": 0.0031, "num_tokens": 13346787.0, "reward": 0.5390625, "reward_std": 0.49795621633529663, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.4375, "rewards/format_reward_func/std": 0.49802759289741516, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 173.0, "completions/mean_length": 1017.3515625, "completions/mean_terminated_length": 173.0, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.32, "grad_norm": 36.864741914986865, "kl": 0.89306640625, "learning_rate": 4.787754418220257e-07, "loss": 0.0072, "num_tokens": 13496060.0, "reward": 0.5390625, "reward_std": 0.44822055101394653, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.4453125, "rewards/format_reward_func/std": 0.4989531338214874, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 562.0, "completions/mean_length": 1020.390625, "completions/mean_terminated_length": 562.0, "completions/min_length": 562.0, "completions/min_terminated_length": 562.0, "epoch": 0.32355555555555554, "grad_norm": 3.2622922916572046, "kl": 0.6884765625, "learning_rate": 4.781905605343716e-07, "loss": -0.0049, "num_tokens": 13645670.0, "reward": 0.5390625, "reward_std": 0.49795621633529663, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.453125, "rewards/format_reward_func/std": 0.4997538626194, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.32711111111111113, "grad_norm": 11.617829398102751, "kl": 2.239990234375, "learning_rate": 4.775980969074385e-07, "loss": 0.0022, "num_tokens": 13795814.0, "reward": 0.484375, "reward_std": 0.43988215923309326, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.3984375, "rewards/format_reward_func/std": 0.4915000796318054, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 1023.53125, "completions/mean_terminated_length": 964.0, "completions/min_length": 964.0, "completions/min_terminated_length": 964.0, "epoch": 0.33066666666666666, "grad_norm": 4.596959167018736, "kl": 0.736083984375, "learning_rate": 4.769980706276687e-07, "loss": 0.0009, "num_tokens": 13945798.0, "reward": 0.421875, "reward_std": 0.44934237003326416, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.375, "rewards/format_reward_func/std": 0.4860251843929291, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3342222222222222, "grad_norm": 10.644076195795904, "kl": 1.4736328125, "learning_rate": 4.7639050163279646e-07, "loss": 0.0015, "num_tokens": 14095922.0, "reward": 0.4921875, "reward_std": 0.4821094870567322, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.453125, "rewards/format_reward_func/std": 0.4997538626194, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 977.0, "completions/mean_length": 1023.6328125, "completions/mean_terminated_length": 977.0, "completions/min_length": 977.0, "completions/min_terminated_length": 977.0, "epoch": 0.3377777777777778, "grad_norm": 2.19242233514565, "kl": 0.3614501953125, "learning_rate": 4.757754101111867e-07, "loss": 0.0001, "num_tokens": 14245959.0, "reward": 0.4765625, "reward_std": 0.47088858485221863, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.4296875, "rewards/format_reward_func/std": 0.4969765841960907, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 1020.71875, "completions/mean_terminated_length": 604.0, "completions/min_length": 604.0, "completions/min_terminated_length": 604.0, "epoch": 0.3413333333333333, "grad_norm": 2.107237251256641, "kl": 0.3170166015625, "learning_rate": 4.751528165011633e-07, "loss": 0.0032, "num_tokens": 14395647.0, "reward": 0.453125, "reward_std": 0.450203001499176, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.421875, "rewards/format_reward_func/std": 0.4957992732524872, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3448888888888889, "grad_norm": 30.971281022996752, "kl": 4.0438232421875, "learning_rate": 4.7452274149033036e-07, "loss": 0.004, "num_tokens": 14545723.0, "reward": 0.5625, "reward_std": 0.44865086674690247, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.5078125, "rewards/format_reward_func/std": 0.5019033551216125, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.34844444444444445, "grad_norm": 7.015689338791732, "kl": 1.017822265625, "learning_rate": 4.738852060148848e-07, "loss": 0.001, "num_tokens": 14695879.0, "reward": 0.578125, "reward_std": 0.5111508369445801, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.5078125, "rewards/format_reward_func/std": 0.5019033551216125, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.352, "grad_norm": 1.6092633309894269, "kl": 0.4576416015625, "learning_rate": 4.7324023125892067e-07, "loss": 0.0005, "num_tokens": 14845907.0, "reward": 0.65625, "reward_std": 0.515102744102478, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.5546875, "rewards/format_reward_func/std": 0.4989531338214874, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.35555555555555557, "grad_norm": 1.5717974590084631, "kl": 0.349609375, "learning_rate": 4.7258783865372496e-07, "loss": 0.0003, "num_tokens": 14995919.0, "reward": 0.625, "reward_std": 0.47661858797073364, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.5009832978248596, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3591111111111111, "grad_norm": 23.661691592795993, "kl": 2.828369140625, "learning_rate": 4.719280498770659e-07, "loss": 0.0028, "num_tokens": 15145999.0, "reward": 0.6875, "reward_std": 0.47596654295921326, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.546875, "rewards/format_reward_func/std": 0.4997538626194, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3626666666666667, "grad_norm": 1.4682398657142082, "kl": 0.357666015625, "learning_rate": 4.712608868524726e-07, "loss": 0.0004, "num_tokens": 15296055.0, "reward": 0.5859375, "reward_std": 0.4858570694923401, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.546875, "rewards/format_reward_func/std": 0.4997538626194, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3662222222222222, "grad_norm": 1.9453823984626142, "kl": 0.57861328125, "learning_rate": 4.70586371748506e-07, "loss": 0.0006, "num_tokens": 15446147.0, "reward": 0.59375, "reward_std": 0.5421922206878662, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.53125, "rewards/format_reward_func/std": 0.5009832978248596, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.36977777777777776, "grad_norm": 1.4043736354809186, "kl": 0.3076171875, "learning_rate": 4.699045269780232e-07, "loss": 0.0003, "num_tokens": 15596243.0, "reward": 0.6328125, "reward_std": 0.5181938409805298, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.5546875, "rewards/format_reward_func/std": 0.4989531338214874, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.37333333333333335, "grad_norm": 1.992727449650588, "kl": 0.46435546875, "learning_rate": 4.692153751974318e-07, "loss": 0.0005, "num_tokens": 15746295.0, "reward": 0.7265625, "reward_std": 0.4990648627281189, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.59375, "rewards/format_reward_func/std": 0.4930621087551117, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3768888888888889, "grad_norm": 1.4530874888780143, "kl": 0.2540283203125, "learning_rate": 4.685189393059377e-07, "loss": 0.0003, "num_tokens": 15896379.0, "reward": 0.7578125, "reward_std": 0.47285783290863037, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.6171875, "rewards/format_reward_func/std": 0.4879830479621887, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3804444444444444, "grad_norm": 1.3184429417396186, "kl": 0.2779541015625, "learning_rate": 4.6781524244478374e-07, "loss": 0.0003, "num_tokens": 16046499.0, "reward": 0.7265625, "reward_std": 0.44229933619499207, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.625, "rewards/format_reward_func/std": 0.4860251843929291, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 936.0, "completions/mean_length": 1023.3125, "completions/mean_terminated_length": 936.0, "completions/min_length": 936.0, "completions/min_terminated_length": 936.0, "epoch": 0.384, "grad_norm": 1.5620581101640454, "kl": 0.4827880859375, "learning_rate": 4.6710430799648143e-07, "loss": -0.0005, "num_tokens": 16196443.0, "reward": 0.828125, "reward_std": 0.3753952980041504, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.75, "rewards/format_reward_func/std": 0.434714138507843, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.38755555555555554, "grad_norm": 1.5234667233107693, "kl": 0.364501953125, "learning_rate": 4.663861595840332e-07, "loss": 0.0004, "num_tokens": 16346535.0, "reward": 0.6640625, "reward_std": 0.5456962585449219, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.578125, "rewards/format_reward_func/std": 0.4957992732524872, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39111111111111113, "grad_norm": 1.1030778627931277, "kl": 0.3199462890625, "learning_rate": 4.6566082107014795e-07, "loss": 0.0003, "num_tokens": 16496607.0, "reward": 0.71875, "reward_std": 0.4209398627281189, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.6484375, "rewards/format_reward_func/std": 0.4793342351913452, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.39466666666666667, "grad_norm": 2.1973527940574638, "kl": 0.65673828125, "learning_rate": 4.649283165564479e-07, "loss": 0.0007, "num_tokens": 16646699.0, "reward": 0.7734375, "reward_std": 0.38354694843292236, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.703125, "rewards/format_reward_func/std": 0.45867621898651123, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.3982222222222222, "grad_norm": 1.3373220551453864, "kl": 0.318359375, "learning_rate": 4.6418867038266807e-07, "loss": 0.0003, "num_tokens": 16796751.0, "reward": 0.8046875, "reward_std": 0.4354780912399292, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.734375, "rewards/format_reward_func/std": 0.44340085983276367, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4017777777777778, "grad_norm": 1.2647169648626801, "kl": 0.2821044921875, "learning_rate": 4.6344190712584713e-07, "loss": 0.0003, "num_tokens": 16946835.0, "reward": 0.703125, "reward_std": 0.4077320694923401, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.6796875, "rewards/format_reward_func/std": 0.4684300124645233, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4053333333333333, "grad_norm": 1.1878240795829997, "kl": 0.361328125, "learning_rate": 4.6268805159951086e-07, "loss": 0.0004, "num_tokens": 17096947.0, "reward": 0.8671875, "reward_std": 0.3113781809806824, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.8046875, "rewards/format_reward_func/std": 0.3979988098144531, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4088888888888889, "grad_norm": 1.8257452561603271, "kl": 0.524169921875, "learning_rate": 4.619271288528478e-07, "loss": 0.0005, "num_tokens": 17247139.0, "reward": 0.875, "reward_std": 0.26230770349502563, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3645188808441162, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41244444444444445, "grad_norm": 0.9523432216519059, "kl": 0.240234375, "learning_rate": 4.611591641698768e-07, "loss": 0.0002, "num_tokens": 17397151.0, "reward": 0.890625, "reward_std": 0.2728765904903412, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.8515625, "rewards/format_reward_func/std": 0.356930136680603, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.416, "grad_norm": 95.2026850158686, "kl": 19.8927001953125, "learning_rate": 4.6038418306860695e-07, "loss": 0.0199, "num_tokens": 17547215.0, "reward": 1.0, "reward_std": 0.28053662180900574, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.8828125, "rewards/format_reward_func/std": 0.322907418012619, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.41955555555555557, "grad_norm": 0.9439098085926358, "kl": 0.3033447265625, "learning_rate": 4.596022113001894e-07, "loss": 0.0003, "num_tokens": 17697291.0, "reward": 0.9609375, "reward_std": 0.2959749102592468, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.8671875, "rewards/format_reward_func/std": 0.3407054841518402, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4231111111111111, "grad_norm": 1.5034266529543672, "kl": 0.43603515625, "learning_rate": 4.58813274848062e-07, "loss": 0.0004, "num_tokens": 17847479.0, "reward": 0.875, "reward_std": 0.29091876745224, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.84375, "rewards/format_reward_func/std": 0.3645188808441162, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4266666666666667, "grad_norm": 0.9097105807939918, "kl": 0.2958984375, "learning_rate": 4.5801739992708604e-07, "loss": 0.0003, "num_tokens": 17997555.0, "reward": 1.1015625, "reward_std": 0.23920938372612, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.8671875, "rewards/format_reward_func/std": 0.3407054841518402, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43022222222222223, "grad_norm": 1.0539526771266905, "kl": 0.3331298828125, "learning_rate": 4.572146129826746e-07, "loss": 0.0003, "num_tokens": 18147663.0, "reward": 0.9765625, "reward_std": 0.229935884475708, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.875, "rewards/format_reward_func/std": 0.3320184051990509, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43377777777777776, "grad_norm": 1.0629899127803644, "kl": 0.380126953125, "learning_rate": 4.5640494068991454e-07, "loss": 0.0004, "num_tokens": 18297743.0, "reward": 0.921875, "reward_std": 0.21037657558918, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.890625, "rewards/format_reward_func/std": 0.31333550810813904, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.43733333333333335, "grad_norm": 0.6726093250836683, "kl": 0.2476806640625, "learning_rate": 4.555884099526793e-07, "loss": 0.0002, "num_tokens": 18447847.0, "reward": 0.9609375, "reward_std": 0.19847732782363892, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.29262590408325195, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4408888888888889, "grad_norm": 1.480783347656319, "kl": 0.57763671875, "learning_rate": 4.547650479027361e-07, "loss": 0.0006, "num_tokens": 18597891.0, "reward": 1.0546875, "reward_std": 0.24404376745224, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.29262590408325195, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4444444444444444, "grad_norm": 0.6920213585959825, "kl": 0.2728271484375, "learning_rate": 4.53934881898843e-07, "loss": 0.0003, "num_tokens": 18747955.0, "reward": 1.0703125, "reward_std": 0.18154376745224, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.448, "grad_norm": 0.6985246612714577, "kl": 0.302978515625, "learning_rate": 4.5309793952584095e-07, "loss": 0.0003, "num_tokens": 18898051.0, "reward": 1.140625, "reward_std": 0.19233438372612, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45155555555555554, "grad_norm": 0.7076159190483788, "kl": 0.339599609375, "learning_rate": 4.5225424859373684e-07, "loss": 0.0003, "num_tokens": 19048119.0, "reward": 0.96875, "reward_std": 0.17912657558918, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.29262590408325195, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45511111111111113, "grad_norm": 10.875869465991155, "kl": 1.052001953125, "learning_rate": 4.514038371367791e-07, "loss": 0.0011, "num_tokens": 19198191.0, "reward": 1.0390625, "reward_std": 0.15866719186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.45866666666666667, "grad_norm": 1.1831682236949836, "kl": 0.614501953125, "learning_rate": 4.5054673341252657e-07, "loss": 0.0006, "num_tokens": 19348283.0, "reward": 1.0078125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4622222222222222, "grad_norm": 0.7113087356983967, "kl": 0.305908203125, "learning_rate": 4.496829659009095e-07, "loss": 0.0003, "num_tokens": 19498451.0, "reward": 1.0234375, "reward_std": 0.20222491025924683, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9140625, "rewards/format_reward_func/std": 0.2813730239868164, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4657777777777778, "grad_norm": 0.8374443599968336, "kl": 0.35009765625, "learning_rate": 4.488125633032831e-07, "loss": 0.0004, "num_tokens": 19648579.0, "reward": 1.0546875, "reward_std": 0.15293270349502563, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4693333333333333, "grad_norm": 0.7971133008611588, "kl": 0.2835693359375, "learning_rate": 4.479355545414738e-07, "loss": 0.0003, "num_tokens": 19798627.0, "reward": 0.9765625, "reward_std": 0.20705929398536682, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.90625, "rewards/format_reward_func/std": 0.29262590408325195, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4728888888888889, "grad_norm": 0.7807611185688261, "kl": 0.2840576171875, "learning_rate": 4.470519687568185e-07, "loss": 0.0003, "num_tokens": 19948683.0, "reward": 1.125, "reward_std": 0.22841878235340118, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.47644444444444445, "grad_norm": 0.49054869618998437, "kl": 0.247802734375, "learning_rate": 4.4616183530919604e-07, "loss": 0.0002, "num_tokens": 20098791.0, "reward": 0.9921875, "reward_std": 0.11420939117670059, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48, "grad_norm": 0.4748347842749267, "kl": 0.2689208984375, "learning_rate": 4.452651837760515e-07, "loss": 0.0003, "num_tokens": 20248907.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.48355555555555557, "grad_norm": 0.9186765694622445, "kl": 0.377685546875, "learning_rate": 4.443620439514138e-07, "loss": 0.0004, "num_tokens": 20398927.0, "reward": 1.0234375, "reward_std": 0.23830929398536682, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9140625, "rewards/format_reward_func/std": 0.2813730239868164, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.4871111111111111, "grad_norm": 0.6352333602375103, "kl": 0.2947998046875, "learning_rate": 4.4345244584490535e-07, "loss": 0.0003, "num_tokens": 20548995.0, "reward": 1.046875, "reward_std": 0.16108438372612, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49066666666666664, "grad_norm": 0.6182942989959218, "kl": 0.283447265625, "learning_rate": 4.4253641968074505e-07, "loss": 0.0003, "num_tokens": 20699135.0, "reward": 1.0078125, "reward_std": 0.15866719186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49422222222222223, "grad_norm": 0.9503041266562497, "kl": 0.360595703125, "learning_rate": 4.41613995896744e-07, "loss": 0.0004, "num_tokens": 20849275.0, "reward": 1.078125, "reward_std": 0.13730771839618683, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9375, "rewards/format_reward_func/std": 0.24301259219646454, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.49777777777777776, "grad_norm": 0.5024213557444295, "kl": 0.227294921875, "learning_rate": 4.40685205143294e-07, "loss": 0.0002, "num_tokens": 20999471.0, "reward": 0.9609375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0078125, "rewards/equation_reward_func/std": 0.0883883461356163, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5013333333333333, "grad_norm": 0.9322239720472953, "kl": 0.2623291015625, "learning_rate": 4.3975007828234914e-07, "loss": 0.0003, "num_tokens": 21149583.0, "reward": 1.0234375, "reward_std": 0.1947515904903412, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.921875, "rewards/format_reward_func/std": 0.2694226801395416, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5048888888888889, "grad_norm": 0.6064606330844944, "kl": 0.2518310546875, "learning_rate": 4.3880864638640035e-07, "loss": 0.0003, "num_tokens": 21299647.0, "reward": 1.0859375, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5084444444444445, "grad_norm": 0.6790833137997374, "kl": 0.2943115234375, "learning_rate": 4.37860940737443e-07, "loss": 0.0003, "num_tokens": 21449683.0, "reward": 1.0703125, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.512, "grad_norm": 0.5261215426559341, "kl": 0.27001953125, "learning_rate": 4.3690699282593723e-07, "loss": 0.0003, "num_tokens": 21599703.0, "reward": 1.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5155555555555555, "grad_norm": 0.8026307382152816, "kl": 0.32861328125, "learning_rate": 4.3594683434976186e-07, "loss": 0.0003, "num_tokens": 21749791.0, "reward": 1.0859375, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5191111111111111, "grad_norm": 0.7172657046843014, "kl": 0.2279052734375, "learning_rate": 4.3498049721316087e-07, "loss": 0.0002, "num_tokens": 21899803.0, "reward": 1.1796875, "reward_std": 0.17670938372612, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5226666666666666, "grad_norm": 1.204155012545121, "kl": 0.29345703125, "learning_rate": 4.340080135256835e-07, "loss": 0.0003, "num_tokens": 22049907.0, "reward": 1.03125, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5262222222222223, "grad_norm": 0.6662161174116621, "kl": 0.31396484375, "learning_rate": 4.3302941560111716e-07, "loss": 0.0003, "num_tokens": 22200039.0, "reward": 1.0234375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5297777777777778, "grad_norm": 0.4687551207366821, "kl": 0.2490234375, "learning_rate": 4.3204473595641367e-07, "loss": 0.0002, "num_tokens": 22350107.0, "reward": 1.0546875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5333333333333333, "grad_norm": 0.7165895343408674, "kl": 0.2591552734375, "learning_rate": 4.3105400731060896e-07, "loss": 0.0003, "num_tokens": 22500175.0, "reward": 1.0703125, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5368888888888889, "grad_norm": 4.77464698740712, "kl": 0.590087890625, "learning_rate": 4.300572625837359e-07, "loss": 0.0006, "num_tokens": 22650231.0, "reward": 1.1328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5404444444444444, "grad_norm": 2.067333654669863, "kl": 0.4598388671875, "learning_rate": 4.2905453489573007e-07, "loss": 0.0005, "num_tokens": 22800291.0, "reward": 1.1171875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.544, "grad_norm": 1.0119921371638605, "kl": 0.4427490234375, "learning_rate": 4.280458575653296e-07, "loss": 0.0004, "num_tokens": 22950419.0, "reward": 1.03125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5475555555555556, "grad_norm": 1.2144184962424178, "kl": 0.333984375, "learning_rate": 4.2703126410896815e-07, "loss": 0.0003, "num_tokens": 23100491.0, "reward": 1.015625, "reward_std": 0.12983438372612, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5511111111111111, "grad_norm": 0.8321316802162844, "kl": 0.2813720703125, "learning_rate": 4.2601078823966065e-07, "loss": 0.0003, "num_tokens": 23250627.0, "reward": 1.03125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5546666666666666, "grad_norm": 1.4648287906993593, "kl": 0.41845703125, "learning_rate": 4.249844638658837e-07, "loss": 0.0004, "num_tokens": 23400683.0, "reward": 1.03125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5582222222222222, "grad_norm": 0.7661892207152143, "kl": 0.3206787109375, "learning_rate": 4.2395232509044856e-07, "loss": 0.0003, "num_tokens": 23550755.0, "reward": 1.03125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5617777777777778, "grad_norm": 0.597016808098165, "kl": 0.3602294921875, "learning_rate": 4.229144062093679e-07, "loss": 0.0004, "num_tokens": 23700911.0, "reward": 1.0234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5653333333333334, "grad_norm": 0.7108571308176221, "kl": 0.40673828125, "learning_rate": 4.218707417107166e-07, "loss": 0.0004, "num_tokens": 23851015.0, "reward": 1.0234375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5688888888888889, "grad_norm": 3.002144453280244, "kl": 0.3819580078125, "learning_rate": 4.208213662734852e-07, "loss": 0.0004, "num_tokens": 24001123.0, "reward": 1.046875, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5724444444444444, "grad_norm": 1.040561593144266, "kl": 0.3697509765625, "learning_rate": 4.197663147664281e-07, "loss": 0.0004, "num_tokens": 24151183.0, "reward": 1.0859375, "reward_std": 0.140625, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.576, "grad_norm": 993.4036541954849, "kl": 32.2384033203125, "learning_rate": 4.187056222469046e-07, "loss": 0.0323, "num_tokens": 24301287.0, "reward": 1.0703125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5795555555555556, "grad_norm": 0.5337520238294831, "kl": 0.4610595703125, "learning_rate": 4.1763932395971433e-07, "loss": 0.0005, "num_tokens": 24451411.0, "reward": 1.0546875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5831111111111111, "grad_norm": 0.9254115340421442, "kl": 0.3011474609375, "learning_rate": 4.1656745533592565e-07, "loss": 0.0003, "num_tokens": 24601487.0, "reward": 1.0546875, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5866666666666667, "grad_norm": 0.4065046820030453, "kl": 0.2613525390625, "learning_rate": 4.1549005199169887e-07, "loss": 0.0003, "num_tokens": 24751575.0, "reward": 1.0703125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5902222222222222, "grad_norm": 0.3834959108382947, "kl": 0.250244140625, "learning_rate": 4.1440714972710245e-07, "loss": 0.0003, "num_tokens": 24901539.0, "reward": 1.0625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5937777777777777, "grad_norm": 0.5413146965143051, "kl": 0.25, "learning_rate": 4.1331878452492366e-07, "loss": 0.0002, "num_tokens": 25051667.0, "reward": 1.046875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.5973333333333334, "grad_norm": 0.3130707621236147, "kl": 0.2958984375, "learning_rate": 4.122249925494726e-07, "loss": 0.0003, "num_tokens": 25201827.0, "reward": 1.0234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6008888888888889, "grad_norm": 2873.3478631996863, "kl": 163.21337890625, "learning_rate": 4.111258101453809e-07, "loss": 0.1633, "num_tokens": 25351931.0, "reward": 1.1015625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6044444444444445, "grad_norm": 0.6050607107434486, "kl": 0.2415771484375, "learning_rate": 4.10021273836394e-07, "loss": 0.0002, "num_tokens": 25502083.0, "reward": 1.0546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.608, "grad_norm": 0.3859043897537288, "kl": 0.2730712890625, "learning_rate": 4.0891142032415717e-07, "loss": 0.0003, "num_tokens": 25652119.0, "reward": 1.0703125, "reward_std": 0.07239051908254623, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6115555555555555, "grad_norm": 0.5858198998360575, "kl": 0.28515625, "learning_rate": 4.0779628648699647e-07, "loss": 0.0003, "num_tokens": 25802235.0, "reward": 1.1171875, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6151111111111112, "grad_norm": 0.7741364709326536, "kl": 0.2783203125, "learning_rate": 4.066759093786931e-07, "loss": 0.0003, "num_tokens": 25952323.0, "reward": 1.109375, "reward_std": 0.17429219186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6186666666666667, "grad_norm": 0.5663990950320411, "kl": 0.2496337890625, "learning_rate": 4.055503262272521e-07, "loss": 0.0002, "num_tokens": 26102343.0, "reward": 1.0859375, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6222222222222222, "grad_norm": 0.39832906260017886, "kl": 0.2906494140625, "learning_rate": 4.044195744336656e-07, "loss": 0.0003, "num_tokens": 26252447.0, "reward": 1.1484375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6257777777777778, "grad_norm": 0.7172526406627818, "kl": 0.3323974609375, "learning_rate": 4.0328369157066975e-07, "loss": 0.0003, "num_tokens": 26402459.0, "reward": 1.0234375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6293333333333333, "grad_norm": 0.629907862678644, "kl": 0.4224853515625, "learning_rate": 4.021427153814965e-07, "loss": 0.0004, "num_tokens": 26552515.0, "reward": 1.125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6328888888888888, "grad_norm": 0.47073355016921764, "kl": 0.269287109375, "learning_rate": 4.009966837786194e-07, "loss": 0.0003, "num_tokens": 26702639.0, "reward": 1.1171875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6364444444444445, "grad_norm": 0.47169497904708996, "kl": 0.2578125, "learning_rate": 3.9984563484249355e-07, "loss": 0.0003, "num_tokens": 26852683.0, "reward": 1.1015625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.64, "grad_norm": 116.97688444343322, "kl": 25.215576171875, "learning_rate": 3.98689606820291e-07, "loss": 0.0253, "num_tokens": 27002751.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6435555555555555, "grad_norm": 0.6001652584267132, "kl": 0.255615234375, "learning_rate": 3.975286381246288e-07, "loss": 0.0003, "num_tokens": 27152923.0, "reward": 1.046875, "reward_std": 0.16108438372612, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6471111111111111, "grad_norm": 0.5537109554444567, "kl": 0.2708740234375, "learning_rate": 3.963627673322936e-07, "loss": 0.0003, "num_tokens": 27302987.0, "reward": 1.109375, "reward_std": 0.1441289782524109, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6506666666666666, "grad_norm": 0.31622311814010917, "kl": 0.24267578125, "learning_rate": 3.951920331829592e-07, "loss": 0.0002, "num_tokens": 27452995.0, "reward": 1.1171875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6542222222222223, "grad_norm": 0.8000561944357085, "kl": 0.313232421875, "learning_rate": 3.9401647457789977e-07, "loss": 0.0003, "num_tokens": 27603067.0, "reward": 1.0546875, "reward_std": 0.0924195945262909, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6577777777777778, "grad_norm": 0.45643202860327076, "kl": 0.284912109375, "learning_rate": 3.9283613057869683e-07, "loss": 0.0003, "num_tokens": 27753147.0, "reward": 1.046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6613333333333333, "grad_norm": 0.39173124733608977, "kl": 0.28662109375, "learning_rate": 3.9165104040594144e-07, "loss": 0.0003, "num_tokens": 27903135.0, "reward": 1.03125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6648888888888889, "grad_norm": 0.26406906444392514, "kl": 0.2335205078125, "learning_rate": 3.9046124343793104e-07, "loss": 0.0002, "num_tokens": 28053295.0, "reward": 1.0390625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6684444444444444, "grad_norm": 0.6778240086428492, "kl": 0.2813720703125, "learning_rate": 3.8926677920936093e-07, "loss": 0.0003, "num_tokens": 28203379.0, "reward": 1.0703125, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.672, "grad_norm": 0.5927557297342876, "kl": 0.270263671875, "learning_rate": 3.880676874100106e-07, "loss": 0.0003, "num_tokens": 28353463.0, "reward": 1.0390625, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6755555555555556, "grad_norm": 1.5359774358122287, "kl": 0.5703125, "learning_rate": 3.868640078834251e-07, "loss": 0.0006, "num_tokens": 28503623.0, "reward": 1.125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6791111111111111, "grad_norm": 1.6209411796594706, "kl": 0.721923828125, "learning_rate": 3.856557806255907e-07, "loss": 0.0007, "num_tokens": 28653691.0, "reward": 1.1328125, "reward_std": 0.140625, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6826666666666666, "grad_norm": 0.5597984973035186, "kl": 0.3275146484375, "learning_rate": 3.844430457836064e-07, "loss": 0.0003, "num_tokens": 28803759.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6862222222222222, "grad_norm": 0.4121539259573786, "kl": 0.2713623046875, "learning_rate": 3.8322584365434934e-07, "loss": 0.0003, "num_tokens": 28953867.0, "reward": 1.1953125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6897777777777778, "grad_norm": 0.3638528130149371, "kl": 0.26220703125, "learning_rate": 3.8200421468313646e-07, "loss": 0.0003, "num_tokens": 29103995.0, "reward": 1.0390625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6933333333333334, "grad_norm": 0.5352989719785112, "kl": 0.245849609375, "learning_rate": 3.807781994623802e-07, "loss": 0.0002, "num_tokens": 29254123.0, "reward": 1.1796875, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.6968888888888889, "grad_norm": 0.2386084759260203, "kl": 0.2442626953125, "learning_rate": 3.7954783873023946e-07, "loss": 0.0002, "num_tokens": 29404279.0, "reward": 1.125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7004444444444444, "grad_norm": 0.5500004640424003, "kl": 0.3114013671875, "learning_rate": 3.7831317336926674e-07, "loss": 0.0003, "num_tokens": 29554443.0, "reward": 1.0234375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.704, "grad_norm": 0.30412427638116296, "kl": 0.2391357421875, "learning_rate": 3.7707424440504863e-07, "loss": 0.0002, "num_tokens": 29704463.0, "reward": 1.0625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7075555555555556, "grad_norm": 0.32738920829851637, "kl": 0.2589111328125, "learning_rate": 3.758310930048436e-07, "loss": 0.0003, "num_tokens": 29854495.0, "reward": 1.1328125, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7111111111111111, "grad_norm": 0.3381279123701687, "kl": 0.246337890625, "learning_rate": 3.7458376047621356e-07, "loss": 0.0002, "num_tokens": 30004527.0, "reward": 1.0390625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7146666666666667, "grad_norm": 0.3394358229660247, "kl": 0.251220703125, "learning_rate": 3.733322882656511e-07, "loss": 0.0003, "num_tokens": 30154627.0, "reward": 1.1328125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7182222222222222, "grad_norm": 0.4166513868673647, "kl": 0.2821044921875, "learning_rate": 3.7207671795720296e-07, "loss": 0.0003, "num_tokens": 30304699.0, "reward": 1.140625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7217777777777777, "grad_norm": 1.051397040881494, "kl": 0.346435546875, "learning_rate": 3.7081709127108767e-07, "loss": 0.0003, "num_tokens": 30454751.0, "reward": 1.171875, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7253333333333334, "grad_norm": 0.401584977465906, "kl": 0.260986328125, "learning_rate": 3.695534500623096e-07, "loss": 0.0003, "num_tokens": 30604859.0, "reward": 1.03125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7288888888888889, "grad_norm": 0.5571028718632256, "kl": 0.267822265625, "learning_rate": 3.68285836319268e-07, "loss": 0.0003, "num_tokens": 30754963.0, "reward": 1.0546875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7324444444444445, "grad_norm": 0.8114453628701095, "kl": 0.329833984375, "learning_rate": 3.6701429216236204e-07, "loss": 0.0003, "num_tokens": 30904995.0, "reward": 1.078125, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.736, "grad_norm": 5.430747811736796, "kl": 0.7723388671875, "learning_rate": 3.657388598425908e-07, "loss": 0.0008, "num_tokens": 31055031.0, "reward": 1.0859375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7395555555555555, "grad_norm": 0.4335747978673317, "kl": 0.2557373046875, "learning_rate": 3.644595817401501e-07, "loss": 0.0003, "num_tokens": 31205099.0, "reward": 1.0390625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7431111111111111, "grad_norm": 0.29724509346477174, "kl": 0.302490234375, "learning_rate": 3.631765003630233e-07, "loss": 0.0003, "num_tokens": 31355123.0, "reward": 1.15625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7466666666666667, "grad_norm": 0.40646811756573753, "kl": 0.2509765625, "learning_rate": 3.6188965834556964e-07, "loss": 0.0003, "num_tokens": 31505239.0, "reward": 1.0390625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7502222222222222, "grad_norm": 0.37868160039206916, "kl": 0.2374267578125, "learning_rate": 3.605990984471073e-07, "loss": 0.0002, "num_tokens": 31655311.0, "reward": 1.1484375, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7537777777777778, "grad_norm": 0.49494706937034993, "kl": 0.267578125, "learning_rate": 3.5930486355049254e-07, "loss": 0.0003, "num_tokens": 31805475.0, "reward": 1.1015625, "reward_std": 0.10364051908254623, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7573333333333333, "grad_norm": 0.4673717767937372, "kl": 0.241455078125, "learning_rate": 3.580069966606949e-07, "loss": 0.0002, "num_tokens": 31955647.0, "reward": 1.0625, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7608888888888888, "grad_norm": 0.48360098551929265, "kl": 0.267333984375, "learning_rate": 3.5670554090336804e-07, "loss": 0.0003, "num_tokens": 32105699.0, "reward": 1.1640625, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7644444444444445, "grad_norm": 0.4246673692632073, "kl": 0.25, "learning_rate": 3.55400539523417e-07, "loss": 0.0003, "num_tokens": 32255815.0, "reward": 1.1875, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.768, "grad_norm": 0.36648738044934387, "kl": 0.2562255859375, "learning_rate": 3.5409203588356096e-07, "loss": 0.0003, "num_tokens": 32405879.0, "reward": 1.1640625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7715555555555556, "grad_norm": 0.35224290073179926, "kl": 0.293212890625, "learning_rate": 3.527800734628927e-07, "loss": 0.0003, "num_tokens": 32555947.0, "reward": 1.140625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7751111111111111, "grad_norm": 0.4512547977698612, "kl": 0.254150390625, "learning_rate": 3.5146469585543386e-07, "loss": 0.0003, "num_tokens": 32706031.0, "reward": 1.1328125, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7786666666666666, "grad_norm": 0.38043876046176694, "kl": 0.2735595703125, "learning_rate": 3.501459467686859e-07, "loss": 0.0003, "num_tokens": 32856059.0, "reward": 1.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7822222222222223, "grad_norm": 0.5631002431510057, "kl": 0.2421875, "learning_rate": 3.4882387002217837e-07, "loss": 0.0002, "num_tokens": 33006239.0, "reward": 1.0234375, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7857777777777778, "grad_norm": 0.9474673348489613, "kl": 0.3988037109375, "learning_rate": 3.474985095460127e-07, "loss": 0.0004, "num_tokens": 33156319.0, "reward": 1.1171875, "reward_std": 0.14545938372612, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7893333333333333, "grad_norm": 0.6295534887147133, "kl": 0.257080078125, "learning_rate": 3.4616990937940207e-07, "loss": 0.0003, "num_tokens": 33306431.0, "reward": 1.1484375, "reward_std": 0.23920938372612, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7928888888888889, "grad_norm": 0.3718764827518923, "kl": 0.261962890625, "learning_rate": 3.448381136692089e-07, "loss": 0.0003, "num_tokens": 33456515.0, "reward": 1.140625, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.7964444444444444, "grad_norm": 0.5066460692369736, "kl": 0.2646484375, "learning_rate": 3.435031666684771e-07, "loss": 0.0003, "num_tokens": 33606655.0, "reward": 1.1171875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8, "grad_norm": 0.37485398051866725, "kl": 0.2615966796875, "learning_rate": 3.421651127349622e-07, "loss": 0.0003, "num_tokens": 33756755.0, "reward": 1.1796875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8035555555555556, "grad_norm": 1.1903519435164054, "kl": 0.5228271484375, "learning_rate": 3.4082399632965696e-07, "loss": 0.0005, "num_tokens": 33906883.0, "reward": 1.09375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8071111111111111, "grad_norm": 0.5760439879380863, "kl": 0.2958984375, "learning_rate": 3.394798620153147e-07, "loss": 0.0003, "num_tokens": 34056971.0, "reward": 1.0234375, "reward_std": 0.140625, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8106666666666666, "grad_norm": 0.38435794906739096, "kl": 0.3095703125, "learning_rate": 3.3813275445496766e-07, "loss": 0.0003, "num_tokens": 34207035.0, "reward": 1.0546875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8142222222222222, "grad_norm": 0.4775237284211774, "kl": 0.28271484375, "learning_rate": 3.367827184104437e-07, "loss": 0.0003, "num_tokens": 34357151.0, "reward": 1.1953125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8177777777777778, "grad_norm": 0.43713842043025036, "kl": 0.2818603515625, "learning_rate": 3.354297987408784e-07, "loss": 0.0003, "num_tokens": 34507319.0, "reward": 1.09375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8213333333333334, "grad_norm": 0.2758218043524574, "kl": 0.2635498046875, "learning_rate": 3.340740404012251e-07, "loss": 0.0003, "num_tokens": 34657495.0, "reward": 1.0078125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8248888888888889, "grad_norm": 0.2272360139614416, "kl": 0.266357421875, "learning_rate": 3.3271548844076034e-07, "loss": 0.0003, "num_tokens": 34807727.0, "reward": 1.0234375, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8284444444444444, "grad_norm": 0.3964722183721534, "kl": 0.268310546875, "learning_rate": 3.313541880015877e-07, "loss": 0.0003, "num_tokens": 34957847.0, "reward": 1.0546875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.832, "grad_norm": 0.38546815849551924, "kl": 0.2796630859375, "learning_rate": 3.299901843171374e-07, "loss": 0.0003, "num_tokens": 35108011.0, "reward": 1.1015625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8355555555555556, "grad_norm": 0.45715949412521334, "kl": 0.330078125, "learning_rate": 3.2862352271066324e-07, "loss": 0.0003, "num_tokens": 35258095.0, "reward": 1.1328125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8391111111111111, "grad_norm": 0.36214907324934437, "kl": 0.2708740234375, "learning_rate": 3.272542485937368e-07, "loss": 0.0003, "num_tokens": 35408247.0, "reward": 1.03125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8426666666666667, "grad_norm": 0.3568571472127005, "kl": 0.2841796875, "learning_rate": 3.2588240746473866e-07, "loss": 0.0003, "num_tokens": 35558331.0, "reward": 1.0625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8462222222222222, "grad_norm": 0.2510920373295251, "kl": 0.2857666015625, "learning_rate": 3.245080449073459e-07, "loss": 0.0003, "num_tokens": 35708423.0, "reward": 1.046875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8497777777777777, "grad_norm": 0.25551782734120665, "kl": 0.26123046875, "learning_rate": 3.231312065890183e-07, "loss": 0.0003, "num_tokens": 35858571.0, "reward": 1.015625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8533333333333334, "grad_norm": 1.0986739740955342, "kl": 0.465087890625, "learning_rate": 3.217519382594801e-07, "loss": 0.0005, "num_tokens": 36008667.0, "reward": 1.1328125, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8568888888888889, "grad_norm": 1.4680134596394123, "kl": 0.5418701171875, "learning_rate": 3.203702857492005e-07, "loss": 0.0005, "num_tokens": 36158787.0, "reward": 1.1015625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8604444444444445, "grad_norm": 3.504419780543159, "kl": 0.490966796875, "learning_rate": 3.189862949678704e-07, "loss": 0.0005, "num_tokens": 36308959.0, "reward": 1.0859375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.864, "grad_norm": 0.3205216146761314, "kl": 0.2939453125, "learning_rate": 3.1760001190287695e-07, "loss": 0.0003, "num_tokens": 36458955.0, "reward": 1.109375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8675555555555555, "grad_norm": 0.31722626012969196, "kl": 0.275634765625, "learning_rate": 3.162114826177756e-07, "loss": 0.0003, "num_tokens": 36609039.0, "reward": 1.2265625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8711111111111111, "grad_norm": 0.49376435813182934, "kl": 0.292724609375, "learning_rate": 3.148207532507595e-07, "loss": 0.0003, "num_tokens": 36759091.0, "reward": 1.1328125, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8746666666666667, "grad_norm": 0.4078375604153148, "kl": 0.2742919921875, "learning_rate": 3.134278700131262e-07, "loss": 0.0003, "num_tokens": 36909219.0, "reward": 1.0, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8782222222222222, "grad_norm": 0.4996956196257578, "kl": 0.276611328125, "learning_rate": 3.1203287918774224e-07, "loss": 0.0003, "num_tokens": 37059287.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8817777777777778, "grad_norm": 0.48726905512159585, "kl": 0.2777099609375, "learning_rate": 3.106358271275056e-07, "loss": 0.0003, "num_tokens": 37209299.0, "reward": 1.15625, "reward_std": 0.14304219186306, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8853333333333333, "grad_norm": 0.36736969448168444, "kl": 0.289794921875, "learning_rate": 3.0923676025380483e-07, "loss": 0.0003, "num_tokens": 37359383.0, "reward": 1.0703125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8888888888888888, "grad_norm": 3.4446668270948635, "kl": 0.556640625, "learning_rate": 3.078357250549772e-07, "loss": 0.0006, "num_tokens": 37509467.0, "reward": 1.1015625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8924444444444445, "grad_norm": 0.38988927409649893, "kl": 0.28759765625, "learning_rate": 3.064327680847635e-07, "loss": 0.0003, "num_tokens": 37659551.0, "reward": 1.09375, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.896, "grad_norm": 0.39501782560739074, "kl": 0.365478515625, "learning_rate": 3.0502793596076136e-07, "loss": 0.0004, "num_tokens": 37809679.0, "reward": 1.0859375, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.8995555555555556, "grad_norm": 0.6831753974217399, "kl": 0.3150634765625, "learning_rate": 3.0362127536287636e-07, "loss": 0.0003, "num_tokens": 37959747.0, "reward": 1.109375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9031111111111111, "grad_norm": 18.18651294130415, "kl": 1.068603515625, "learning_rate": 3.022128330317705e-07, "loss": 0.0011, "num_tokens": 38109831.0, "reward": 1.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9066666666666666, "grad_norm": 1.5209772510903372, "kl": 0.88330078125, "learning_rate": 3.0080265576730977e-07, "loss": 0.0009, "num_tokens": 38259963.0, "reward": 1.0078125, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9102222222222223, "grad_norm": 0.7730838959893301, "kl": 0.452392578125, "learning_rate": 2.993907904270084e-07, "loss": 0.0005, "num_tokens": 38410079.0, "reward": 1.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9137777777777778, "grad_norm": 0.40952708495860707, "kl": 0.271728515625, "learning_rate": 2.979772839244723e-07, "loss": 0.0003, "num_tokens": 38560299.0, "reward": 1.0546875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9173333333333333, "grad_norm": 0.5131983702741918, "kl": 0.287109375, "learning_rate": 2.965621832278401e-07, "loss": 0.0003, "num_tokens": 38710367.0, "reward": 1.0625, "reward_std": 0.10605771839618683, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9208888888888889, "grad_norm": 0.27991645858337005, "kl": 0.280029296875, "learning_rate": 2.951455353582224e-07, "loss": 0.0003, "num_tokens": 38860451.0, "reward": 1.03125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9244444444444444, "grad_norm": 0.3695586276265612, "kl": 0.284423828125, "learning_rate": 2.937273873881396e-07, "loss": 0.0003, "num_tokens": 39010531.0, "reward": 1.078125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.928, "grad_norm": 0.32295542952532946, "kl": 0.2978515625, "learning_rate": 2.9230778643995724e-07, "loss": 0.0003, "num_tokens": 39160527.0, "reward": 1.078125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9315555555555556, "grad_norm": 0.13720612956039277, "kl": 0.28173828125, "learning_rate": 2.90886779684321e-07, "loss": 0.0003, "num_tokens": 39310607.0, "reward": 1.0703125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9351111111111111, "grad_norm": 0.4018144462500134, "kl": 0.29150390625, "learning_rate": 2.894644143385885e-07, "loss": 0.0003, "num_tokens": 39460783.0, "reward": 1.0390625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9386666666666666, "grad_norm": 0.4442504823873414, "kl": 0.2998046875, "learning_rate": 2.8804073766526095e-07, "loss": 0.0003, "num_tokens": 39610787.0, "reward": 1.2265625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9422222222222222, "grad_norm": 0.4440909084713661, "kl": 0.30126953125, "learning_rate": 2.866157969704125e-07, "loss": 0.0003, "num_tokens": 39760827.0, "reward": 1.109375, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9457777777777778, "grad_norm": 0.5269613873598679, "kl": 0.413818359375, "learning_rate": 2.851896396021181e-07, "loss": 0.0004, "num_tokens": 39910887.0, "reward": 1.1640625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9493333333333334, "grad_norm": 0.4007808970790064, "kl": 0.2958984375, "learning_rate": 2.837623129488808e-07, "loss": 0.0003, "num_tokens": 40060939.0, "reward": 1.1796875, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9528888888888889, "grad_norm": 0.35752016362078665, "kl": 0.323486328125, "learning_rate": 2.823338644380566e-07, "loss": 0.0003, "num_tokens": 40210967.0, "reward": 1.171875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9564444444444444, "grad_norm": 0.3659963766807816, "kl": 0.302978515625, "learning_rate": 2.809043415342784e-07, "loss": 0.0003, "num_tokens": 40361083.0, "reward": 1.078125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.96, "grad_norm": 0.43326692126792443, "kl": 0.28662109375, "learning_rate": 2.794737917378797e-07, "loss": 0.0003, "num_tokens": 40511163.0, "reward": 1.2421875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.2421875, "rewards/equation_reward_func/std": 0.4300905168056488, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9635555555555556, "grad_norm": 0.3008406369990196, "kl": 0.285400390625, "learning_rate": 2.780422625833153e-07, "loss": 0.0003, "num_tokens": 40661207.0, "reward": 1.1015625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9671111111111111, "grad_norm": 0.47409102685589544, "kl": 0.304931640625, "learning_rate": 2.766098016375823e-07, "loss": 0.0003, "num_tokens": 40811323.0, "reward": 1.0546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9706666666666667, "grad_norm": 0.5824521290348351, "kl": 0.296630859375, "learning_rate": 2.751764564986396e-07, "loss": 0.0003, "num_tokens": 40961391.0, "reward": 1.078125, "reward_std": 0.125, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9742222222222222, "grad_norm": 0.3098249242942063, "kl": 0.271728515625, "learning_rate": 2.737422747938259e-07, "loss": 0.0003, "num_tokens": 41111499.0, "reward": 1.078125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9777777777777777, "grad_norm": 0.3768856479378887, "kl": 0.2998046875, "learning_rate": 2.723073041782776e-07, "loss": 0.0003, "num_tokens": 41261595.0, "reward": 1.1171875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9813333333333333, "grad_norm": 0.321022268469654, "kl": 0.305419921875, "learning_rate": 2.708715923333451e-07, "loss": 0.0003, "num_tokens": 41411671.0, "reward": 1.0234375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9848888888888889, "grad_norm": 0.44295012274215156, "kl": 0.318115234375, "learning_rate": 2.6943518696500835e-07, "loss": 0.0003, "num_tokens": 41561679.0, "reward": 1.2890625, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.3046875, "rewards/equation_reward_func/std": 0.46208351850509644, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9884444444444445, "grad_norm": 0.33094736446897677, "kl": 0.3056640625, "learning_rate": 2.6799813580229174e-07, "loss": 0.0003, "num_tokens": 41711767.0, "reward": 1.140625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.992, "grad_norm": 0.4091229618206246, "kl": 0.298583984375, "learning_rate": 2.6656048659567834e-07, "loss": 0.0003, "num_tokens": 41861911.0, "reward": 1.0859375, "reward_std": 0.11046179383993149, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9955555555555555, "grad_norm": 0.3775067433097031, "kl": 0.2855224609375, "learning_rate": 2.65122287115523e-07, "loss": 0.0003, "num_tokens": 42012051.0, "reward": 1.1015625, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 0.9991111111111111, "grad_norm": 0.2903344484393435, "kl": 0.287109375, "learning_rate": 2.63683585150465e-07, "loss": 0.0003, "num_tokens": 42162171.0, "reward": 1.03125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0035555555555555, "grad_norm": 0.5198369640431273, "kl": 0.301513671875, "learning_rate": 2.622444285058404e-07, "loss": 0.0003, "num_tokens": 42312203.0, "reward": 1.1484375, "reward_std": 0.11420939117670059, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.007111111111111, "grad_norm": 0.36920919300132393, "kl": 0.279052734375, "learning_rate": 2.6080486500209347e-07, "loss": 0.0003, "num_tokens": 42462203.0, "reward": 1.0234375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0106666666666666, "grad_norm": 0.5785443150378919, "kl": 0.30712890625, "learning_rate": 2.5936494247318733e-07, "loss": 0.0003, "num_tokens": 42612307.0, "reward": 1.0625, "reward_std": 0.15534991025924683, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0142222222222221, "grad_norm": 0.5720624594932939, "kl": 0.316650390625, "learning_rate": 2.5792470876501517e-07, "loss": 0.0003, "num_tokens": 42762499.0, "reward": 1.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0177777777777777, "grad_norm": 0.8260734017133515, "kl": 0.442626953125, "learning_rate": 2.5648421173380974e-07, "loss": 0.0004, "num_tokens": 42912579.0, "reward": 1.1171875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0213333333333334, "grad_norm": 0.42283670624523484, "kl": 0.2974853515625, "learning_rate": 2.550434992445538e-07, "loss": 0.0003, "num_tokens": 43062627.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.024888888888889, "grad_norm": 0.6536485123354365, "kl": 0.310546875, "learning_rate": 2.536026191693893e-07, "loss": 0.0003, "num_tokens": 43212699.0, "reward": 1.140625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0284444444444445, "grad_norm": 0.2842967079399991, "kl": 0.2734375, "learning_rate": 2.521616193860266e-07, "loss": 0.0003, "num_tokens": 43362827.0, "reward": 1.078125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.032, "grad_norm": 0.3454425051822889, "kl": 0.293212890625, "learning_rate": 2.507205477761539e-07, "loss": 0.0003, "num_tokens": 43512883.0, "reward": 1.140625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0355555555555556, "grad_norm": 0.45884634779664035, "kl": 0.317138671875, "learning_rate": 2.4927945222384613e-07, "loss": 0.0003, "num_tokens": 43662995.0, "reward": 1.03125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.039111111111111, "grad_norm": 0.4904684323745879, "kl": 0.35498046875, "learning_rate": 2.4783838061397334e-07, "loss": 0.0004, "num_tokens": 43813083.0, "reward": 1.109375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0426666666666666, "grad_norm": 0.3779515219530343, "kl": 0.302978515625, "learning_rate": 2.4639738083061073e-07, "loss": 0.0003, "num_tokens": 43963099.0, "reward": 1.0859375, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0462222222222222, "grad_norm": 0.5521510485136667, "kl": 0.298828125, "learning_rate": 2.4495650075544613e-07, "loss": 0.0003, "num_tokens": 44113231.0, "reward": 1.125, "reward_std": 0.16591878235340118, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0497777777777777, "grad_norm": 0.3580518334835267, "kl": 0.32470703125, "learning_rate": 2.435157882661903e-07, "loss": 0.0003, "num_tokens": 44263335.0, "reward": 1.140625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0533333333333332, "grad_norm": 0.5049863161854076, "kl": 0.28759765625, "learning_rate": 2.420752912349848e-07, "loss": 0.0003, "num_tokens": 44413459.0, "reward": 1.0546875, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.056888888888889, "grad_norm": 0.3210601564293952, "kl": 0.287353515625, "learning_rate": 2.4063505752681265e-07, "loss": 0.0003, "num_tokens": 44563443.0, "reward": 1.1015625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0604444444444445, "grad_norm": 0.3841327605748596, "kl": 0.302978515625, "learning_rate": 2.3919513499790646e-07, "loss": 0.0003, "num_tokens": 44713483.0, "reward": 1.0703125, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.064, "grad_norm": 0.36848414149450265, "kl": 0.2978515625, "learning_rate": 2.3775557149415953e-07, "loss": 0.0003, "num_tokens": 44863531.0, "reward": 1.0546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0675555555555556, "grad_norm": 0.29905575227014863, "kl": 0.277099609375, "learning_rate": 2.3631641484953493e-07, "loss": 0.0003, "num_tokens": 45013643.0, "reward": 1.078125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0711111111111111, "grad_norm": 0.41114495746607604, "kl": 0.2890625, "learning_rate": 2.3487771288447703e-07, "loss": 0.0003, "num_tokens": 45163707.0, "reward": 1.171875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0746666666666667, "grad_norm": 5.715003606598738, "kl": 0.7919921875, "learning_rate": 2.3343951340432158e-07, "loss": 0.0008, "num_tokens": 45313775.0, "reward": 1.0859375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 652.0, "completions/mean_length": 1021.09375, "completions/mean_terminated_length": 652.0, "completions/min_length": 652.0, "completions/min_terminated_length": 652.0, "epoch": 1.0782222222222222, "grad_norm": 217.93165237405415, "kl": 15.4482421875, "learning_rate": 2.3200186419770823e-07, "loss": 0.0155, "num_tokens": 45463523.0, "reward": 1.078125, "reward_std": 0.13466878235340118, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0817777777777777, "grad_norm": 0.47159212158122316, "kl": 0.30078125, "learning_rate": 2.3056481303499163e-07, "loss": 0.0003, "num_tokens": 45613587.0, "reward": 1.078125, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0853333333333333, "grad_norm": 1.6168823274880646, "kl": 0.556884765625, "learning_rate": 2.291284076666549e-07, "loss": 0.0006, "num_tokens": 45763747.0, "reward": 1.046875, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0888888888888888, "grad_norm": 0.46162617703210324, "kl": 0.29248046875, "learning_rate": 2.2769269582172236e-07, "loss": 0.0003, "num_tokens": 45913835.0, "reward": 1.09375, "reward_std": 0.09858438372612, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0924444444444443, "grad_norm": 0.4671638304135226, "kl": 0.32568359375, "learning_rate": 2.262577252061741e-07, "loss": 0.0003, "num_tokens": 46063903.0, "reward": 1.1171875, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.096, "grad_norm": 0.4936514315382175, "kl": 0.294921875, "learning_rate": 2.2482354350136043e-07, "loss": 0.0003, "num_tokens": 46213947.0, "reward": 1.09375, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.0995555555555556, "grad_norm": 0.27593094177212785, "kl": 0.283203125, "learning_rate": 2.2339019836241768e-07, "loss": 0.0003, "num_tokens": 46364079.0, "reward": 1.109375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1031111111111112, "grad_norm": 0.617656921158583, "kl": 0.312255859375, "learning_rate": 2.219577374166847e-07, "loss": 0.0003, "num_tokens": 46514171.0, "reward": 1.09375, "reward_std": 0.15625, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1066666666666667, "grad_norm": 0.44450597120036134, "kl": 0.3004150390625, "learning_rate": 2.2052620826212031e-07, "loss": 0.0003, "num_tokens": 46664187.0, "reward": 1.171875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1102222222222222, "grad_norm": 0.5057759573090262, "kl": 0.284912109375, "learning_rate": 2.1909565846572158e-07, "loss": 0.0003, "num_tokens": 46814311.0, "reward": 1.1015625, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1137777777777778, "grad_norm": 0.3201288640949846, "kl": 0.2760009765625, "learning_rate": 2.1766613556194344e-07, "loss": 0.0003, "num_tokens": 46964423.0, "reward": 1.1015625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1173333333333333, "grad_norm": 0.3772490752847668, "kl": 0.283203125, "learning_rate": 2.1623768705111914e-07, "loss": 0.0003, "num_tokens": 47114503.0, "reward": 1.0859375, "reward_std": 0.08469823002815247, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1208888888888888, "grad_norm": 0.5025688072949658, "kl": 0.290283203125, "learning_rate": 2.1481036039788185e-07, "loss": 0.0003, "num_tokens": 47264599.0, "reward": 1.1328125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1244444444444444, "grad_norm": 0.38392968338478706, "kl": 0.28564453125, "learning_rate": 2.133842030295875e-07, "loss": 0.0003, "num_tokens": 47414671.0, "reward": 1.1015625, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1280000000000001, "grad_norm": 0.49457010221602965, "kl": 0.280029296875, "learning_rate": 2.1195926233473905e-07, "loss": 0.0003, "num_tokens": 47564755.0, "reward": 1.1015625, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1315555555555556, "grad_norm": 0.4244083602755102, "kl": 0.321533203125, "learning_rate": 2.105355856614115e-07, "loss": 0.0003, "num_tokens": 47714827.0, "reward": 1.1171875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1351111111111112, "grad_norm": 0.37474376222317474, "kl": 0.320556640625, "learning_rate": 2.0911322031567907e-07, "loss": 0.0003, "num_tokens": 47864947.0, "reward": 1.125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1386666666666667, "grad_norm": 0.41223649731911943, "kl": 0.3602294921875, "learning_rate": 2.076922135600427e-07, "loss": 0.0004, "num_tokens": 48015063.0, "reward": 1.03125, "reward_std": 0.08537659049034119, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1422222222222222, "grad_norm": 0.39432796084885563, "kl": 0.28173828125, "learning_rate": 2.0627261261186048e-07, "loss": 0.0003, "num_tokens": 48165143.0, "reward": 1.0234375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1457777777777778, "grad_norm": 0.4626204719118449, "kl": 0.289794921875, "learning_rate": 2.0485446464177752e-07, "loss": 0.0003, "num_tokens": 48315247.0, "reward": 1.140625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1493333333333333, "grad_norm": 0.5425416938386872, "kl": 0.2747802734375, "learning_rate": 2.034378167721599e-07, "loss": 0.0003, "num_tokens": 48465395.0, "reward": 1.140625, "reward_std": 0.16108438372612, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1528888888888889, "grad_norm": 0.360456611267802, "kl": 0.2930908203125, "learning_rate": 2.0202271607552766e-07, "loss": 0.0003, "num_tokens": 48615543.0, "reward": 1.0703125, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1564444444444444, "grad_norm": 0.5661057550331362, "kl": 0.30810546875, "learning_rate": 2.006092095729916e-07, "loss": 0.0003, "num_tokens": 48765671.0, "reward": 1.0, "reward_std": 0.125, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.16, "grad_norm": 0.41539473398116905, "kl": 0.2861328125, "learning_rate": 1.9919734423269018e-07, "loss": 0.0003, "num_tokens": 48915803.0, "reward": 1.03125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1635555555555555, "grad_norm": 3.0305326916437685, "kl": 0.519775390625, "learning_rate": 1.9778716696822948e-07, "loss": 0.0005, "num_tokens": 49065907.0, "reward": 1.1171875, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 463.0, "completions/mean_length": 1019.6171875, "completions/mean_terminated_length": 463.0, "completions/min_length": 463.0, "completions/min_terminated_length": 463.0, "epoch": 1.1671111111111112, "grad_norm": 0.23250382669343098, "kl": 0.2734375, "learning_rate": 1.9637872463712362e-07, "loss": 0.0003, "num_tokens": 49215434.0, "reward": 1.0546875, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1706666666666667, "grad_norm": 0.5016066341552496, "kl": 0.31689453125, "learning_rate": 1.9497206403923864e-07, "loss": 0.0003, "num_tokens": 49365470.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1742222222222223, "grad_norm": 0.3723114856113278, "kl": 0.294189453125, "learning_rate": 1.9356723191523646e-07, "loss": 0.0003, "num_tokens": 49515474.0, "reward": 1.109375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1777777777777778, "grad_norm": 0.5268422251511491, "kl": 0.311279296875, "learning_rate": 1.921642749450228e-07, "loss": 0.0003, "num_tokens": 49665510.0, "reward": 1.1015625, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1813333333333333, "grad_norm": 0.3681994364570604, "kl": 0.2958984375, "learning_rate": 1.9076323974619512e-07, "loss": 0.0003, "num_tokens": 49815606.0, "reward": 1.125, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1848888888888889, "grad_norm": 0.33480600776322345, "kl": 0.2802734375, "learning_rate": 1.8936417287249446e-07, "loss": 0.0003, "num_tokens": 49965686.0, "reward": 1.0546875, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1884444444444444, "grad_norm": 0.5051846564957432, "kl": 0.2933349609375, "learning_rate": 1.8796712081225774e-07, "loss": 0.0003, "num_tokens": 50115722.0, "reward": 1.0546875, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.192, "grad_norm": 0.39018801865142605, "kl": 0.2841796875, "learning_rate": 1.8657212998687388e-07, "loss": 0.0003, "num_tokens": 50265858.0, "reward": 1.1171875, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.1955555555555555, "grad_norm": 2.454466270426045, "kl": 0.592041015625, "learning_rate": 1.8517924674924046e-07, "loss": 0.0006, "num_tokens": 50415962.0, "reward": 1.0859375, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.199111111111111, "grad_norm": 0.3442676416516453, "kl": 0.28466796875, "learning_rate": 1.8378851738222439e-07, "loss": 0.0003, "num_tokens": 50566102.0, "reward": 1.046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2026666666666666, "grad_norm": 0.4811344633751864, "kl": 0.35986328125, "learning_rate": 1.82399988097123e-07, "loss": 0.0004, "num_tokens": 50716226.0, "reward": 1.078125, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2062222222222223, "grad_norm": 0.414941287812233, "kl": 0.304443359375, "learning_rate": 1.8101370503212962e-07, "loss": 0.0003, "num_tokens": 50866226.0, "reward": 1.1640625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2097777777777778, "grad_norm": 0.34063767725339195, "kl": 0.2890625, "learning_rate": 1.7962971425079946e-07, "loss": 0.0003, "num_tokens": 51016326.0, "reward": 1.09375, "reward_std": 0.0767945945262909, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 1023.109375, "completions/mean_terminated_length": 910.0, "completions/min_length": 910.0, "completions/min_terminated_length": 910.0, "epoch": 1.2133333333333334, "grad_norm": 0.49192139215580927, "kl": 0.346923828125, "learning_rate": 1.7824806174051994e-07, "loss": 0.0003, "num_tokens": 51166188.0, "reward": 1.15625, "reward_std": 0.10605771094560623, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.216888888888889, "grad_norm": 0.443226987052731, "kl": 0.38330078125, "learning_rate": 1.7686879341098172e-07, "loss": 0.0004, "num_tokens": 51316348.0, "reward": 1.015625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2204444444444444, "grad_norm": 0.5782232075315293, "kl": 0.3115234375, "learning_rate": 1.7549195509265407e-07, "loss": 0.0003, "num_tokens": 51466500.0, "reward": 1.2109375, "reward_std": 0.19958597421646118, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.984375, "completions/max_length": 1024.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 1018.734375, "completions/mean_terminated_length": 687.0, "completions/min_length": 510.0, "completions/min_terminated_length": 510.0, "epoch": 1.224, "grad_norm": 0.36297139808479756, "kl": 0.309814453125, "learning_rate": 1.7411759253526137e-07, "loss": 0.0003, "num_tokens": 51616002.0, "reward": 1.046875, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2275555555555555, "grad_norm": 0.31239331329351544, "kl": 0.33447265625, "learning_rate": 1.7274575140626315e-07, "loss": 0.0003, "num_tokens": 51766162.0, "reward": 1.0703125, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.231111111111111, "grad_norm": 0.3169212178434831, "kl": 0.3173828125, "learning_rate": 1.713764772893368e-07, "loss": 0.0003, "num_tokens": 51916270.0, "reward": 1.015625, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2346666666666666, "grad_norm": 0.30315931045979133, "kl": 0.302490234375, "learning_rate": 1.7000981568286263e-07, "loss": 0.0003, "num_tokens": 52066298.0, "reward": 1.2265625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.234375, "rewards/equation_reward_func/std": 0.42527204751968384, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2382222222222223, "grad_norm": 0.36563422908896903, "kl": 0.351806640625, "learning_rate": 1.6864581199841226e-07, "loss": 0.0004, "num_tokens": 52216406.0, "reward": 1.046875, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2417777777777779, "grad_norm": 0.4217921525746735, "kl": 0.3779296875, "learning_rate": 1.6728451155923966e-07, "loss": 0.0004, "num_tokens": 52366506.0, "reward": 1.0546875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2453333333333334, "grad_norm": 0.324461315875632, "kl": 0.289794921875, "learning_rate": 1.6592595959877493e-07, "loss": 0.0003, "num_tokens": 52516566.0, "reward": 1.0859375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.248888888888889, "grad_norm": 0.4568605162309185, "kl": 0.3271484375, "learning_rate": 1.6457020125912158e-07, "loss": 0.0003, "num_tokens": 52666654.0, "reward": 1.1953125, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2524444444444445, "grad_norm": 0.25417793013727236, "kl": 0.29541015625, "learning_rate": 1.6321728158955633e-07, "loss": 0.0003, "num_tokens": 52816754.0, "reward": 1.0390625, "reward_std": 0.041140519082546234, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.256, "grad_norm": 0.41489797113184423, "kl": 0.35595703125, "learning_rate": 1.6186724554503237e-07, "loss": 0.0004, "num_tokens": 52966926.0, "reward": 1.09375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2595555555555555, "grad_norm": 0.40881954060029546, "kl": 0.2998046875, "learning_rate": 1.6052013798468528e-07, "loss": 0.0003, "num_tokens": 53117030.0, "reward": 1.0390625, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.263111111111111, "grad_norm": 0.4161740859363292, "kl": 0.302734375, "learning_rate": 1.5917600367034302e-07, "loss": 0.0003, "num_tokens": 53267102.0, "reward": 1.0859375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2666666666666666, "grad_norm": 0.48764370226892895, "kl": 0.306884765625, "learning_rate": 1.578348872650378e-07, "loss": 0.0003, "num_tokens": 53417230.0, "reward": 1.171875, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2702222222222221, "grad_norm": 0.2478225774564058, "kl": 0.2763671875, "learning_rate": 1.564968333315229e-07, "loss": 0.0003, "num_tokens": 53567346.0, "reward": 1.0703125, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2737777777777777, "grad_norm": 0.3291619924331192, "kl": 0.29248046875, "learning_rate": 1.5516188633079107e-07, "loss": 0.0003, "num_tokens": 53717382.0, "reward": 1.15625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2773333333333334, "grad_norm": 0.25700581201739003, "kl": 0.3076171875, "learning_rate": 1.5383009062059794e-07, "loss": 0.0003, "num_tokens": 53867502.0, "reward": 1.0625, "reward_std": 0.056765519082546234, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.280888888888889, "grad_norm": 0.3056011189640017, "kl": 0.303955078125, "learning_rate": 1.525014904539873e-07, "loss": 0.0003, "num_tokens": 54017530.0, "reward": 1.1328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2844444444444445, "grad_norm": 0.4656676273125868, "kl": 0.284912109375, "learning_rate": 1.5117612997782158e-07, "loss": 0.0003, "num_tokens": 54167482.0, "reward": 1.1171875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.288, "grad_norm": 0.3826433724652753, "kl": 0.281005859375, "learning_rate": 1.49854053231314e-07, "loss": 0.0003, "num_tokens": 54317618.0, "reward": 1.03125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2915555555555556, "grad_norm": 0.4446154009676478, "kl": 0.314453125, "learning_rate": 1.4853530414456612e-07, "loss": 0.0003, "num_tokens": 54467774.0, "reward": 1.015625, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 529.0, "completions/mean_length": 1020.1328125, "completions/mean_terminated_length": 529.0, "completions/min_length": 529.0, "completions/min_terminated_length": 529.0, "epoch": 1.295111111111111, "grad_norm": 138.5626606914031, "kl": 9.446044921875, "learning_rate": 1.4721992653710718e-07, "loss": 0.0095, "num_tokens": 54617343.0, "reward": 1.0390625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.2986666666666666, "grad_norm": 0.4059259101024079, "kl": 0.28759765625, "learning_rate": 1.45907964116439e-07, "loss": 0.0003, "num_tokens": 54767375.0, "reward": 1.171875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3022222222222222, "grad_norm": 0.2976769970604099, "kl": 0.298828125, "learning_rate": 1.4459946047658305e-07, "loss": 0.0003, "num_tokens": 54917459.0, "reward": 1.0546875, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3057777777777777, "grad_norm": 0.9632489497898731, "kl": 0.34033203125, "learning_rate": 1.4329445909663194e-07, "loss": 0.0003, "num_tokens": 55067575.0, "reward": 1.2109375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3093333333333335, "grad_norm": 0.27141674456407305, "kl": 0.317626953125, "learning_rate": 1.4199300333930515e-07, "loss": 0.0003, "num_tokens": 55217663.0, "reward": 1.125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3128888888888888, "grad_norm": 0.3664732680899467, "kl": 0.296142578125, "learning_rate": 1.4069513644950744e-07, "loss": 0.0003, "num_tokens": 55367775.0, "reward": 1.125, "reward_std": 0.09483679383993149, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3164444444444445, "grad_norm": 0.4258773946699787, "kl": 0.30078125, "learning_rate": 1.394009015528927e-07, "loss": 0.0003, "num_tokens": 55517851.0, "reward": 1.1953125, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.32, "grad_norm": 0.5173226603673574, "kl": 0.28515625, "learning_rate": 1.3811034165443036e-07, "loss": 0.0003, "num_tokens": 55667915.0, "reward": 1.078125, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3235555555555556, "grad_norm": 0.28284555705880354, "kl": 0.29296875, "learning_rate": 1.3682349963697676e-07, "loss": 0.0003, "num_tokens": 55818043.0, "reward": 1.1328125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3271111111111111, "grad_norm": 0.38120393105342143, "kl": 0.281005859375, "learning_rate": 1.3554041825985e-07, "loss": 0.0003, "num_tokens": 55968159.0, "reward": 1.1171875, "reward_std": 0.0924195945262909, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3306666666666667, "grad_norm": 0.274813313541087, "kl": 0.291259765625, "learning_rate": 1.3426114015740915e-07, "loss": 0.0003, "num_tokens": 56118259.0, "reward": 1.046875, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3342222222222222, "grad_norm": 0.22678726772648233, "kl": 0.2919921875, "learning_rate": 1.3298570783763805e-07, "loss": 0.0003, "num_tokens": 56268371.0, "reward": 1.109375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3377777777777777, "grad_norm": 0.41457129958616, "kl": 0.305908203125, "learning_rate": 1.31714163680732e-07, "loss": 0.0003, "num_tokens": 56418403.0, "reward": 1.125, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3413333333333333, "grad_norm": 0.5210145141470561, "kl": 0.292724609375, "learning_rate": 1.3044654993769044e-07, "loss": 0.0003, "num_tokens": 56568531.0, "reward": 1.0234375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3448888888888888, "grad_norm": 0.4117092342286789, "kl": 0.30078125, "learning_rate": 1.2918290872891236e-07, "loss": 0.0003, "num_tokens": 56718539.0, "reward": 1.1171875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3484444444444446, "grad_norm": 0.4954791387781314, "kl": 0.310546875, "learning_rate": 1.2792328204279712e-07, "loss": 0.0003, "num_tokens": 56868623.0, "reward": 1.046875, "reward_std": 0.12983438372612, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3519999999999999, "grad_norm": 0.2545776704521135, "kl": 0.328369140625, "learning_rate": 1.2666771173434892e-07, "loss": 0.0003, "num_tokens": 57018775.0, "reward": 1.125, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3555555555555556, "grad_norm": 0.3146740336761455, "kl": 0.32421875, "learning_rate": 1.2541623952378655e-07, "loss": 0.0003, "num_tokens": 57168891.0, "reward": 1.1328125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3591111111111112, "grad_norm": 0.4579205523197049, "kl": 0.3515625, "learning_rate": 1.2416890699515636e-07, "loss": 0.0004, "num_tokens": 57319019.0, "reward": 1.109375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3626666666666667, "grad_norm": 0.36618428888693066, "kl": 0.30029296875, "learning_rate": 1.2292575559495143e-07, "loss": 0.0003, "num_tokens": 57469015.0, "reward": 1.09375, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3662222222222222, "grad_norm": 0.4686555030489483, "kl": 0.337890625, "learning_rate": 1.216868266307333e-07, "loss": 0.0003, "num_tokens": 57619207.0, "reward": 1.0859375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3697777777777778, "grad_norm": 0.46307942376493777, "kl": 0.348388671875, "learning_rate": 1.2045216126976054e-07, "loss": 0.0003, "num_tokens": 57769271.0, "reward": 1.1640625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3733333333333333, "grad_norm": 0.8005369371858218, "kl": 0.4453125, "learning_rate": 1.1922180053761985e-07, "loss": 0.0004, "num_tokens": 57919371.0, "reward": 1.0859375, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3768888888888888, "grad_norm": 0.41987593538846585, "kl": 0.41064453125, "learning_rate": 1.1799578531686355e-07, "loss": 0.0004, "num_tokens": 58069419.0, "reward": 1.0390625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3804444444444444, "grad_norm": 0.37356810239403104, "kl": 0.301513671875, "learning_rate": 1.1677415634565066e-07, "loss": 0.0003, "num_tokens": 58219491.0, "reward": 1.0859375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.384, "grad_norm": 0.3725985247012251, "kl": 0.304931640625, "learning_rate": 1.1555695421639369e-07, "loss": 0.0003, "num_tokens": 58369575.0, "reward": 1.21875, "reward_std": 0.09483679383993149, "rewards/equation_reward_func/mean": 0.2265625, "rewards/equation_reward_func/std": 0.4202519655227661, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3875555555555557, "grad_norm": 0.5494057898608353, "kl": 0.305908203125, "learning_rate": 1.1434421937440927e-07, "loss": 0.0003, "num_tokens": 58519615.0, "reward": 1.0859375, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3911111111111112, "grad_norm": 0.09051477444513548, "kl": 0.3125, "learning_rate": 1.1313599211657493e-07, "loss": 0.0003, "num_tokens": 58669707.0, "reward": 1.09375, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3946666666666667, "grad_norm": 0.4155817066153559, "kl": 0.292236328125, "learning_rate": 1.1193231258998933e-07, "loss": 0.0003, "num_tokens": 58819823.0, "reward": 1.0703125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.3982222222222223, "grad_norm": 0.6754576511921663, "kl": 0.573486328125, "learning_rate": 1.1073322079063913e-07, "loss": 0.0006, "num_tokens": 58969911.0, "reward": 1.1640625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4017777777777778, "grad_norm": 0.15297965653574863, "kl": 0.29541015625, "learning_rate": 1.0953875656206896e-07, "loss": 0.0003, "num_tokens": 59120027.0, "reward": 1.1171875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4053333333333333, "grad_norm": 0.19840806732730873, "kl": 0.3031005859375, "learning_rate": 1.083489595940586e-07, "loss": 0.0003, "num_tokens": 59270151.0, "reward": 0.9921875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4088888888888889, "grad_norm": 0.437741941119917, "kl": 0.2861328125, "learning_rate": 1.0716386942130312e-07, "loss": 0.0003, "num_tokens": 59420171.0, "reward": 1.171875, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.1875, "rewards/equation_reward_func/std": 0.39184603095054626, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4124444444444444, "grad_norm": 0.2839123171859526, "kl": 0.334716796875, "learning_rate": 1.0598352542210021e-07, "loss": 0.0003, "num_tokens": 59570287.0, "reward": 1.0390625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.416, "grad_norm": 0.4426434710495713, "kl": 0.28955078125, "learning_rate": 1.0480796681704077e-07, "loss": 0.0003, "num_tokens": 59720451.0, "reward": 1.0, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4195555555555557, "grad_norm": 0.26505824000927997, "kl": 0.344482421875, "learning_rate": 1.0363723266770649e-07, "loss": 0.0003, "num_tokens": 59870531.0, "reward": 1.1640625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.423111111111111, "grad_norm": 0.49780745905277374, "kl": 0.314453125, "learning_rate": 1.0247136187537123e-07, "loss": 0.0003, "num_tokens": 60020615.0, "reward": 1.2109375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4266666666666667, "grad_norm": 0.353547985641691, "kl": 0.306884765625, "learning_rate": 1.0131039317970907e-07, "loss": 0.0003, "num_tokens": 60170703.0, "reward": 1.0078125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4302222222222223, "grad_norm": 0.3856632741740315, "kl": 0.318603515625, "learning_rate": 1.0015436515750636e-07, "loss": 0.0003, "num_tokens": 60320791.0, "reward": 1.0859375, "reward_std": 0.10364051908254623, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4337777777777778, "grad_norm": 0.21701469386568364, "kl": 0.29443359375, "learning_rate": 9.900331622138063e-08, "loss": 0.0003, "num_tokens": 60470819.0, "reward": 1.046875, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4373333333333334, "grad_norm": 0.3956199952252776, "kl": 0.31298828125, "learning_rate": 9.785728461850346e-08, "loss": 0.0003, "num_tokens": 60620927.0, "reward": 1.0390625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4408888888888889, "grad_norm": 0.43358722043382675, "kl": 0.333251953125, "learning_rate": 9.671630842933027e-08, "loss": 0.0003, "num_tokens": 60771023.0, "reward": 1.0859375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4444444444444444, "grad_norm": 0.43484180770648706, "kl": 0.308837890625, "learning_rate": 9.558042556633439e-08, "loss": 0.0003, "num_tokens": 60921135.0, "reward": 1.1015625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.448, "grad_norm": 0.5452317684518833, "kl": 0.335693359375, "learning_rate": 9.44496737727479e-08, "loss": 0.0003, "num_tokens": 61071251.0, "reward": 1.15625, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4515555555555555, "grad_norm": 0.4385308886488731, "kl": 0.304443359375, "learning_rate": 9.332409062130686e-08, "loss": 0.0003, "num_tokens": 61221419.0, "reward": 1.1171875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.455111111111111, "grad_norm": 0.6583908943728609, "kl": 0.3759765625, "learning_rate": 9.220371351300352e-08, "loss": 0.0004, "num_tokens": 61371511.0, "reward": 1.109375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4586666666666668, "grad_norm": 0.4718113655947949, "kl": 0.315185546875, "learning_rate": 9.10885796758428e-08, "loss": 0.0003, "num_tokens": 61521515.0, "reward": 1.109375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.462222222222222, "grad_norm": 0.24662161853039036, "kl": 0.303466796875, "learning_rate": 8.997872616360603e-08, "loss": 0.0003, "num_tokens": 61671603.0, "reward": 1.1171875, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4657777777777778, "grad_norm": 0.5413466420478469, "kl": 0.296875, "learning_rate": 8.887418985461903e-08, "loss": 0.0003, "num_tokens": 61821727.0, "reward": 1.1640625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4693333333333334, "grad_norm": 0.49533967889419184, "kl": 0.3154296875, "learning_rate": 8.777500745052743e-08, "loss": 0.0003, "num_tokens": 61971763.0, "reward": 1.1953125, "reward_std": 0.13708597421646118, "rewards/equation_reward_func/mean": 0.21875, "rewards/equation_reward_func/std": 0.41502299904823303, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.472888888888889, "grad_norm": 0.045461932603756326, "kl": 0.291015625, "learning_rate": 8.668121547507634e-08, "loss": 0.0003, "num_tokens": 62121887.0, "reward": 1.09375, "reward_std": 0.0, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4764444444444444, "grad_norm": 0.41283123432479657, "kl": 0.3173828125, "learning_rate": 8.559285027289753e-08, "loss": 0.0003, "num_tokens": 62271995.0, "reward": 1.1171875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.48, "grad_norm": 0.35202272038106325, "kl": 0.310302734375, "learning_rate": 8.450994800830111e-08, "loss": 0.0003, "num_tokens": 62422143.0, "reward": 1.0546875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4835555555555555, "grad_norm": 0.34253260774893085, "kl": 0.29931640625, "learning_rate": 8.343254466407435e-08, "loss": 0.0003, "num_tokens": 62572191.0, "reward": 1.0546875, "reward_std": 0.05918271467089653, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.487111111111111, "grad_norm": 0.6192549377846177, "kl": 0.34423828125, "learning_rate": 8.236067604028562e-08, "loss": 0.0003, "num_tokens": 62722263.0, "reward": 1.1328125, "reward_std": 0.19958597421646118, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4906666666666666, "grad_norm": 0.42419250467683867, "kl": 0.330322265625, "learning_rate": 8.129437775309533e-08, "loss": 0.0003, "num_tokens": 62872423.0, "reward": 1.0703125, "reward_std": 0.07239051908254623, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4942222222222221, "grad_norm": 0.4600512751439256, "kl": 0.29296875, "learning_rate": 8.023368523357182e-08, "loss": 0.0003, "num_tokens": 63022531.0, "reward": 1.109375, "reward_std": 0.11662659049034119, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.4977777777777779, "grad_norm": 0.30707853703970844, "kl": 0.33349609375, "learning_rate": 7.917863372651476e-08, "loss": 0.0003, "num_tokens": 63172623.0, "reward": 1.1953125, "reward_std": 0.015625, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5013333333333332, "grad_norm": 0.3692855700280947, "kl": 0.319580078125, "learning_rate": 7.812925828928332e-08, "loss": 0.0003, "num_tokens": 63322715.0, "reward": 1.1484375, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.504888888888889, "grad_norm": 0.4014494891551021, "kl": 0.286865234375, "learning_rate": 7.708559379063204e-08, "loss": 0.0003, "num_tokens": 63472819.0, "reward": 1.046875, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5084444444444445, "grad_norm": 0.4232608141165222, "kl": 0.30615234375, "learning_rate": 7.604767490955138e-08, "loss": 0.0003, "num_tokens": 63622875.0, "reward": 1.15625, "reward_std": 0.08801551908254623, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.512, "grad_norm": 0.4560293272329757, "kl": 0.30517578125, "learning_rate": 7.501553613411626e-08, "loss": 0.0003, "num_tokens": 63772987.0, "reward": 1.0078125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5155555555555555, "grad_norm": 0.4361736440468482, "kl": 0.33984375, "learning_rate": 7.398921176033928e-08, "loss": 0.0003, "num_tokens": 63923035.0, "reward": 1.046875, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.519111111111111, "grad_norm": 0.6635480222906371, "kl": 0.311767578125, "learning_rate": 7.296873589103184e-08, "loss": 0.0003, "num_tokens": 64073127.0, "reward": 1.09375, "reward_std": 0.15625, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5226666666666666, "grad_norm": 0.3459960132516468, "kl": 0.33349609375, "learning_rate": 7.195414243467029e-08, "loss": 0.0003, "num_tokens": 64223223.0, "reward": 1.09375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5262222222222221, "grad_norm": 0.37737333972639386, "kl": 0.30224609375, "learning_rate": 7.094546510426994e-08, "loss": 0.0003, "num_tokens": 64373331.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 1024.0, "completions/max_terminated_length": 882.0, "completions/mean_length": 1022.890625, "completions/mean_terminated_length": 882.0, "completions/min_length": 882.0, "completions/min_terminated_length": 882.0, "epoch": 1.529777777777778, "grad_norm": 0.7177685998564922, "kl": 0.31591796875, "learning_rate": 6.994273741626405e-08, "loss": 0.0003, "num_tokens": 64523209.0, "reward": 1.1640625, "reward_std": 0.18154378235340118, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5333333333333332, "grad_norm": 0.3255614188057404, "kl": 0.289306640625, "learning_rate": 6.8945992689391e-08, "loss": 0.0003, "num_tokens": 64673377.0, "reward": 1.0625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.536888888888889, "grad_norm": 1.754742235050356, "kl": 0.4462890625, "learning_rate": 6.795526404358628e-08, "loss": 0.0004, "num_tokens": 64823533.0, "reward": 0.9765625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5404444444444443, "grad_norm": 0.3643382370307102, "kl": 0.292724609375, "learning_rate": 6.697058439888283e-08, "loss": 0.0003, "num_tokens": 64973545.0, "reward": 1.1328125, "reward_std": 0.06975159049034119, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.544, "grad_norm": 0.630079375586487, "kl": 0.364501953125, "learning_rate": 6.599198647431642e-08, "loss": 0.0004, "num_tokens": 65123697.0, "reward": 1.03125, "reward_std": 0.14304219186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5475555555555556, "grad_norm": 0.28958212851563225, "kl": 0.322998046875, "learning_rate": 6.501950278683907e-08, "loss": 0.0003, "num_tokens": 65273741.0, "reward": 1.15625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.551111111111111, "grad_norm": 0.41053342748286836, "kl": 0.30419921875, "learning_rate": 6.405316565023805e-08, "loss": 0.0003, "num_tokens": 65423789.0, "reward": 1.078125, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5546666666666666, "grad_norm": 0.5111478111826663, "kl": 0.2998046875, "learning_rate": 6.309300717406274e-08, "loss": 0.0003, "num_tokens": 65573853.0, "reward": 1.1171875, "reward_std": 0.11420939117670059, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5582222222222222, "grad_norm": 0.3679464709884017, "kl": 0.310302734375, "learning_rate": 6.213905926255697e-08, "loss": 0.0003, "num_tokens": 65723841.0, "reward": 1.15625, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.1796875, "rewards/equation_reward_func/std": 0.3854354918003082, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.561777777777778, "grad_norm": 0.46692650885856746, "kl": 0.3310546875, "learning_rate": 6.119135361359965e-08, "loss": 0.0003, "num_tokens": 65873981.0, "reward": 1.0703125, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5653333333333332, "grad_norm": 0.38381494119670495, "kl": 0.307373046875, "learning_rate": 6.024992171765089e-08, "loss": 0.0003, "num_tokens": 66024057.0, "reward": 1.1171875, "reward_std": 0.06975159049034119, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.568888888888889, "grad_norm": 0.5471698963781498, "kl": 0.31103515625, "learning_rate": 5.9314794856705983e-08, "loss": 0.0003, "num_tokens": 66174081.0, "reward": 1.140625, "reward_std": 0.1392945945262909, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5724444444444443, "grad_norm": 0.4545334966796343, "kl": 0.297607421875, "learning_rate": 5.8386004103255975e-08, "loss": 0.0003, "num_tokens": 66324225.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.576, "grad_norm": 0.4235701460309408, "kl": 0.290771484375, "learning_rate": 5.7463580319254853e-08, "loss": 0.0003, "num_tokens": 66474309.0, "reward": 1.09375, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5795555555555556, "grad_norm": 0.3247726601860356, "kl": 0.308837890625, "learning_rate": 5.6547554155094626e-08, "loss": 0.0003, "num_tokens": 66624429.0, "reward": 1.109375, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5831111111111111, "grad_norm": 0.40435668824890736, "kl": 0.3369140625, "learning_rate": 5.563795604858615e-08, "loss": 0.0003, "num_tokens": 66774545.0, "reward": 1.0859375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5866666666666667, "grad_norm": 0.20151285242580103, "kl": 0.31005859375, "learning_rate": 5.473481622394849e-08, "loss": 0.0003, "num_tokens": 66924573.0, "reward": 1.015625, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5902222222222222, "grad_norm": 0.49584377279174674, "kl": 0.32763671875, "learning_rate": 5.3838164690803935e-08, "loss": 0.0003, "num_tokens": 67074721.0, "reward": 1.1015625, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5937777777777777, "grad_norm": 0.5709614104348788, "kl": 0.38916015625, "learning_rate": 5.294803124318145e-08, "loss": 0.0004, "num_tokens": 67224805.0, "reward": 1.2109375, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.2109375, "rewards/equation_reward_func/std": 0.4095771610736847, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.5973333333333333, "grad_norm": 0.40666450684163086, "kl": 0.303955078125, "learning_rate": 5.20644454585262e-08, "loss": 0.0003, "num_tokens": 67374829.0, "reward": 1.0546875, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.600888888888889, "grad_norm": 0.5101919983503219, "kl": 0.3310546875, "learning_rate": 5.1187436696716906e-08, "loss": 0.0003, "num_tokens": 67524913.0, "reward": 1.0703125, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6044444444444443, "grad_norm": 0.4677649463602687, "kl": 0.3291015625, "learning_rate": 5.0317034099090524e-08, "loss": 0.0003, "num_tokens": 67674957.0, "reward": 1.015625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.608, "grad_norm": 0.47256502926562893, "kl": 0.3193359375, "learning_rate": 4.9453266587473423e-08, "loss": 0.0003, "num_tokens": 67825037.0, "reward": 1.078125, "reward_std": 0.11662659049034119, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6115555555555554, "grad_norm": 0.42644349348534893, "kl": 0.288818359375, "learning_rate": 4.859616286322094e-08, "loss": 0.0003, "num_tokens": 67975093.0, "reward": 0.9609375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6151111111111112, "grad_norm": 0.6524368836285781, "kl": 0.2926025390625, "learning_rate": 4.774575140626316e-08, "loss": 0.0003, "num_tokens": 68125177.0, "reward": 1.0625, "reward_std": 0.16108438372612, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6186666666666667, "grad_norm": 0.46164389030987435, "kl": 0.30322265625, "learning_rate": 4.6902060474159036e-08, "loss": 0.0003, "num_tokens": 68275213.0, "reward": 1.0625, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6222222222222222, "grad_norm": 1.0007665367327419, "kl": 0.3310546875, "learning_rate": 4.6065118101157016e-08, "loss": 0.0003, "num_tokens": 68425389.0, "reward": 1.0390625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6257777777777778, "grad_norm": 0.42753125244448825, "kl": 0.3046875, "learning_rate": 4.5234952097263965e-08, "loss": 0.0003, "num_tokens": 68575397.0, "reward": 1.09375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6293333333333333, "grad_norm": 0.46658144028533827, "kl": 0.312255859375, "learning_rate": 4.4411590047320617e-08, "loss": 0.0003, "num_tokens": 68725541.0, "reward": 1.0234375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6328888888888888, "grad_norm": 0.4157853647166392, "kl": 0.33642578125, "learning_rate": 4.359505931008553e-08, "loss": 0.0003, "num_tokens": 68875697.0, "reward": 1.03125, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6364444444444444, "grad_norm": 0.5353473136872781, "kl": 0.32373046875, "learning_rate": 4.278538701732534e-08, "loss": 0.0003, "num_tokens": 69025717.0, "reward": 1.140625, "reward_std": 0.10605771839618683, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6400000000000001, "grad_norm": 0.2988427017643135, "kl": 0.302001953125, "learning_rate": 4.198260007291399e-08, "loss": 0.0003, "num_tokens": 69175869.0, "reward": 1.0859375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6435555555555554, "grad_norm": 0.6794574489935594, "kl": 0.313720703125, "learning_rate": 4.118672515193794e-08, "loss": 0.0003, "num_tokens": 69325917.0, "reward": 1.1015625, "reward_std": 0.17670938372612, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6471111111111112, "grad_norm": 1.0736204487313878, "kl": 0.387451171875, "learning_rate": 4.039778869981064e-08, "loss": 0.0004, "num_tokens": 69476013.0, "reward": 1.0078125, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6506666666666665, "grad_norm": 0.4936915298323271, "kl": 0.294189453125, "learning_rate": 3.961581693139307e-08, "loss": 0.0003, "num_tokens": 69626137.0, "reward": 1.0546875, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6542222222222223, "grad_norm": 0.45453076530383824, "kl": 0.311279296875, "learning_rate": 3.884083583012318e-08, "loss": 0.0003, "num_tokens": 69776221.0, "reward": 1.0546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6577777777777778, "grad_norm": 0.23245767178394858, "kl": 0.303466796875, "learning_rate": 3.807287114715216e-08, "loss": 0.0003, "num_tokens": 69926329.0, "reward": 1.1015625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6613333333333333, "grad_norm": 0.4361056139999621, "kl": 0.335693359375, "learning_rate": 3.731194840048915e-08, "loss": 0.0003, "num_tokens": 70076501.0, "reward": 0.984375, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6648888888888889, "grad_norm": 0.44868822636950484, "kl": 0.301513671875, "learning_rate": 3.655809287415284e-08, "loss": 0.0003, "num_tokens": 70226605.0, "reward": 1.015625, "reward_std": 0.11179219186306, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6684444444444444, "grad_norm": 0.50096997905928, "kl": 0.363037109375, "learning_rate": 3.581132961733191e-08, "loss": 0.0004, "num_tokens": 70376689.0, "reward": 1.09375, "reward_std": 0.10605771839618683, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6720000000000002, "grad_norm": 0.5934095945734738, "kl": 0.314697265625, "learning_rate": 3.5071683443552045e-08, "loss": 0.0003, "num_tokens": 70526745.0, "reward": 1.109375, "reward_std": 0.18858680129051208, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6755555555555555, "grad_norm": 0.8718041152389696, "kl": 0.49267578125, "learning_rate": 3.433917892985208e-08, "loss": 0.0005, "num_tokens": 70676793.0, "reward": 1.0390625, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6791111111111112, "grad_norm": 0.38376908265068144, "kl": 0.30908203125, "learning_rate": 3.3613840415966764e-08, "loss": 0.0003, "num_tokens": 70826861.0, "reward": 1.0703125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6826666666666665, "grad_norm": 0.2758967343775331, "kl": 0.295166015625, "learning_rate": 3.2895692003518575e-08, "loss": 0.0003, "num_tokens": 70976993.0, "reward": 1.09375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6862222222222223, "grad_norm": 0.30691530756523544, "kl": 0.302978515625, "learning_rate": 3.218475755521621e-08, "loss": 0.0003, "num_tokens": 71127021.0, "reward": 1.03125, "reward_std": 0.056765519082546234, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6897777777777778, "grad_norm": 0.4544709951437524, "kl": 0.33154296875, "learning_rate": 3.1481060694062365e-08, "loss": 0.0003, "num_tokens": 71277129.0, "reward": 1.0703125, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.6933333333333334, "grad_norm": 61.61147982083761, "kl": 6.212646484375, "learning_rate": 3.078462480256819e-08, "loss": 0.0062, "num_tokens": 71427205.0, "reward": 1.0390625, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.696888888888889, "grad_norm": 0.4754812555873108, "kl": 0.311767578125, "learning_rate": 3.0095473021976794e-08, "loss": 0.0003, "num_tokens": 71577265.0, "reward": 1.140625, "reward_std": 0.08537659049034119, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7004444444444444, "grad_norm": 0.46835463170958996, "kl": 0.3349609375, "learning_rate": 2.9413628251493934e-08, "loss": 0.0003, "num_tokens": 71727385.0, "reward": 1.0078125, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.704, "grad_norm": 4.0148785973038805, "kl": 0.880859375, "learning_rate": 2.8739113147527417e-08, "loss": 0.0009, "num_tokens": 71877501.0, "reward": 1.046875, "reward_std": 0.14478103816509247, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7075555555555555, "grad_norm": 0.4965898386218197, "kl": 0.302978515625, "learning_rate": 2.8071950122934036e-08, "loss": 0.0003, "num_tokens": 72027633.0, "reward": 1.0234375, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7111111111111112, "grad_norm": 0.39335123516514386, "kl": 0.350341796875, "learning_rate": 2.7412161346275052e-08, "loss": 0.0004, "num_tokens": 72177693.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7146666666666666, "grad_norm": 0.25026214846151146, "kl": 0.312744140625, "learning_rate": 2.675976874107935e-08, "loss": 0.0003, "num_tokens": 72327781.0, "reward": 1.1484375, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7182222222222223, "grad_norm": 0.47011111911733117, "kl": 0.297119140625, "learning_rate": 2.611479398511518e-08, "loss": 0.0003, "num_tokens": 72477873.0, "reward": 1.1484375, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7217777777777776, "grad_norm": 0.4673084705795621, "kl": 0.302734375, "learning_rate": 2.5477258509669614e-08, "loss": 0.0003, "num_tokens": 72627957.0, "reward": 1.078125, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7253333333333334, "grad_norm": 0.449377162287945, "kl": 0.312744140625, "learning_rate": 2.4847183498836714e-08, "loss": 0.0003, "num_tokens": 72778025.0, "reward": 1.0625, "reward_std": 0.11662659049034119, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.728888888888889, "grad_norm": 0.45502967472019323, "kl": 0.30712890625, "learning_rate": 2.4224589888813263e-08, "loss": 0.0003, "num_tokens": 72928141.0, "reward": 1.1953125, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.203125, "rewards/equation_reward_func/std": 0.40390563011169434, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7324444444444445, "grad_norm": 0.30054183666403217, "kl": 0.292724609375, "learning_rate": 2.3609498367203467e-08, "loss": 0.0003, "num_tokens": 73078217.0, "reward": 1.125, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.736, "grad_norm": 0.5258673098250929, "kl": 0.356689453125, "learning_rate": 2.300192937233128e-08, "loss": 0.0004, "num_tokens": 73228321.0, "reward": 1.1171875, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7395555555555555, "grad_norm": 0.405287504222553, "kl": 0.350830078125, "learning_rate": 2.240190309256143e-08, "loss": 0.0004, "num_tokens": 73378473.0, "reward": 0.9921875, "reward_std": 0.07239051908254623, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.743111111111111, "grad_norm": 20.30319933604708, "kl": 2.790771484375, "learning_rate": 2.1809439465628382e-08, "loss": 0.0028, "num_tokens": 73528593.0, "reward": 1.0546875, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7466666666666666, "grad_norm": 0.4822430316707212, "kl": 0.316162109375, "learning_rate": 2.122455817797428e-08, "loss": 0.0003, "num_tokens": 73678733.0, "reward": 1.125, "reward_std": 0.06358679383993149, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7502222222222223, "grad_norm": 0.3578328345996123, "kl": 0.30517578125, "learning_rate": 2.0647278664094188e-08, "loss": 0.0003, "num_tokens": 73828813.0, "reward": 1.0546875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7537777777777777, "grad_norm": 0.550657608118331, "kl": 0.342041015625, "learning_rate": 2.007762010589098e-08, "loss": 0.0003, "num_tokens": 73978857.0, "reward": 1.0703125, "reward_std": 0.10364051908254623, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7573333333333334, "grad_norm": 0.641484434186097, "kl": 0.308837890625, "learning_rate": 1.9515601432037317e-08, "loss": 0.0003, "num_tokens": 74128929.0, "reward": 1.1171875, "reward_std": 0.15029378235340118, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.9453125, "rewards/format_reward_func/std": 0.22826264798641205, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7608888888888887, "grad_norm": 0.28265032311261334, "kl": 0.31787109375, "learning_rate": 1.8961241317347333e-08, "loss": 0.0003, "num_tokens": 74278929.0, "reward": 1.078125, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7644444444444445, "grad_norm": 0.35554010327724356, "kl": 0.30615234375, "learning_rate": 1.8414558182155456e-08, "loss": 0.0003, "num_tokens": 74429009.0, "reward": 1.09375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.768, "grad_norm": 0.44096260500690493, "kl": 0.324951171875, "learning_rate": 1.787557019170488e-08, "loss": 0.0003, "num_tokens": 74579081.0, "reward": 1.09375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7715555555555556, "grad_norm": 0.454168464225727, "kl": 0.333740234375, "learning_rate": 1.734429525554365e-08, "loss": 0.0003, "num_tokens": 74729137.0, "reward": 1.1875, "reward_std": 0.09858439117670059, "rewards/equation_reward_func/mean": 0.1953125, "rewards/equation_reward_func/std": 0.3979988098144531, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.775111111111111, "grad_norm": 0.3385678990485817, "kl": 0.325439453125, "learning_rate": 1.6820751026929674e-08, "loss": 0.0003, "num_tokens": 74879217.0, "reward": 1.1015625, "reward_std": 0.05170939117670059, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7786666666666666, "grad_norm": 0.35138058736700556, "kl": 0.2978515625, "learning_rate": 1.6304954902244095e-08, "loss": 0.0003, "num_tokens": 75029385.0, "reward": 0.9765625, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0, "rewards/equation_reward_func/std": 0.0, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7822222222222224, "grad_norm": 0.5869996441190313, "kl": 0.3193359375, "learning_rate": 1.5796924020413327e-08, "loss": 0.0003, "num_tokens": 75179401.0, "reward": 1.1171875, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7857777777777777, "grad_norm": 0.5058752959126767, "kl": 0.29052734375, "learning_rate": 1.529667526233941e-08, "loss": 0.0003, "num_tokens": 75329481.0, "reward": 1.1015625, "reward_std": 0.109375, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7893333333333334, "grad_norm": 0.6518639813557456, "kl": 0.38818359375, "learning_rate": 1.4804225250339281e-08, "loss": 0.0004, "num_tokens": 75479569.0, "reward": 1.109375, "reward_std": 0.1478765904903412, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7928888888888888, "grad_norm": 0.7704301788997951, "kl": 0.429443359375, "learning_rate": 1.4319590347592254e-08, "loss": 0.0004, "num_tokens": 75629605.0, "reward": 1.1328125, "reward_std": 0.14545938372612, "rewards/equation_reward_func/mean": 0.1484375, "rewards/equation_reward_func/std": 0.356930136680603, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.7964444444444445, "grad_norm": 0.445609692523617, "kl": 0.326416015625, "learning_rate": 1.3842786657596446e-08, "loss": 0.0003, "num_tokens": 75779673.0, "reward": 1.109375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.1328125, "rewards/equation_reward_func/std": 0.3407054841518402, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8, "grad_norm": 0.1929324045282612, "kl": 0.341064453125, "learning_rate": 1.3373830023633597e-08, "loss": 0.0003, "num_tokens": 75929773.0, "reward": 1.109375, "reward_std": 0.03125, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8035555555555556, "grad_norm": 253.4831153847798, "kl": 39.014404296875, "learning_rate": 1.2912736028242777e-08, "loss": 0.0391, "num_tokens": 76079829.0, "reward": 1.015625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8071111111111111, "grad_norm": 0.3941482834711751, "kl": 0.332275390625, "learning_rate": 1.2459519992702311e-08, "loss": 0.0003, "num_tokens": 76229901.0, "reward": 1.1015625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8106666666666666, "grad_norm": 0.44490676712616123, "kl": 0.376220703125, "learning_rate": 1.2014196976521035e-08, "loss": 0.0004, "num_tokens": 76380013.0, "reward": 1.0625, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8142222222222222, "grad_norm": 0.5178719928256352, "kl": 0.30419921875, "learning_rate": 1.1576781776937634e-08, "loss": 0.0003, "num_tokens": 76530081.0, "reward": 1.0546875, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8177777777777777, "grad_norm": 0.4231335974252663, "kl": 0.32177734375, "learning_rate": 1.1147288928429116e-08, "loss": 0.0003, "num_tokens": 76680169.0, "reward": 1.0703125, "reward_std": 0.08295938372612, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8213333333333335, "grad_norm": 0.7284463829796036, "kl": 0.338134765625, "learning_rate": 1.0725732702227735e-08, "loss": 0.0003, "num_tokens": 76830181.0, "reward": 1.15625, "reward_std": 0.18879535794258118, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8248888888888888, "grad_norm": 0.2900221752186249, "kl": 0.298828125, "learning_rate": 1.0312127105846947e-08, "loss": 0.0003, "num_tokens": 76980265.0, "reward": 1.0703125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8284444444444445, "grad_norm": 0.6219543064987271, "kl": 0.333740234375, "learning_rate": 9.906485882615695e-09, "loss": 0.0003, "num_tokens": 77130341.0, "reward": 1.1171875, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8319999999999999, "grad_norm": 0.433076359193613, "kl": 0.29931640625, "learning_rate": 9.50882251122212e-09, "loss": 0.0003, "num_tokens": 77280453.0, "reward": 1.09375, "reward_std": 0.08537659049034119, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8355555555555556, "grad_norm": 0.49027180300777035, "kl": 0.3251953125, "learning_rate": 9.119150205265324e-09, "loss": 0.0003, "num_tokens": 77430413.0, "reward": 1.0859375, "reward_std": 0.15293270349502563, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8391111111111111, "grad_norm": 0.3625919980341686, "kl": 0.2978515625, "learning_rate": 8.737481912816592e-09, "loss": 0.0003, "num_tokens": 77580525.0, "reward": 0.9921875, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8426666666666667, "grad_norm": 0.3618046565066512, "kl": 0.32958984375, "learning_rate": 8.363830315988945e-09, "loss": 0.0003, "num_tokens": 77730685.0, "reward": 1.0078125, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8462222222222222, "grad_norm": 0.39610604926318627, "kl": 0.30859375, "learning_rate": 7.99820783051583e-09, "loss": 0.0003, "num_tokens": 77880801.0, "reward": 1.0390625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8497777777777777, "grad_norm": 0.4807968539401694, "kl": 0.310546875, "learning_rate": 7.640626605338624e-09, "loss": 0.0003, "num_tokens": 78030881.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8533333333333335, "grad_norm": 0.27107753228441966, "kl": 0.294189453125, "learning_rate": 7.291098522202776e-09, "loss": 0.0003, "num_tokens": 78180985.0, "reward": 1.0078125, "reward_std": 0.046875, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8568888888888888, "grad_norm": 1.2809259822053336, "kl": 0.3994140625, "learning_rate": 6.949635195263259e-09, "loss": 0.0004, "num_tokens": 78331033.0, "reward": 1.1328125, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8604444444444446, "grad_norm": 0.4907122346762557, "kl": 0.29931640625, "learning_rate": 6.616247970698319e-09, "loss": 0.0003, "num_tokens": 78481177.0, "reward": 1.0546875, "reward_std": 0.15866719186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.953125, "rewards/format_reward_func/std": 0.21220162510871887, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8639999999999999, "grad_norm": 0.40898622769145226, "kl": 0.30908203125, "learning_rate": 6.290947926332835e-09, "loss": 0.0003, "num_tokens": 78631381.0, "reward": 1.015625, "reward_std": 0.09284991025924683, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8675555555555556, "grad_norm": 0.4863909720236819, "kl": 0.329833984375, "learning_rate": 5.97374587126992e-09, "loss": 0.0003, "num_tokens": 78781545.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.871111111111111, "grad_norm": 0.506380378033068, "kl": 0.39990234375, "learning_rate": 5.664652345531845e-09, "loss": 0.0004, "num_tokens": 78931773.0, "reward": 1.0234375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0546875, "rewards/equation_reward_func/std": 0.22826264798641205, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8746666666666667, "grad_norm": 0.6537426649681487, "kl": 0.31396484375, "learning_rate": 5.363677619709933e-09, "loss": 0.0003, "num_tokens": 79081809.0, "reward": 1.0703125, "reward_std": 0.12850399315357208, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8782222222222222, "grad_norm": 0.3993975373897874, "kl": 0.31591796875, "learning_rate": 5.070831694623135e-09, "loss": 0.0003, "num_tokens": 79231889.0, "reward": 1.0234375, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.03125, "rewards/equation_reward_func/std": 0.1746762990951538, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8817777777777778, "grad_norm": 0.5480289036903339, "kl": 0.3037109375, "learning_rate": 4.786124300985822e-09, "loss": 0.0003, "num_tokens": 79381957.0, "reward": 1.09375, "reward_std": 0.15271097421646118, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8853333333333333, "grad_norm": 0.27684868062560514, "kl": 0.3173828125, "learning_rate": 4.509564899084328e-09, "loss": 0.0003, "num_tokens": 79531997.0, "reward": 1.0390625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.046875, "rewards/equation_reward_func/std": 0.21220162510871887, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8888888888888888, "grad_norm": 0.45768830719657116, "kl": 0.3017578125, "learning_rate": 4.241162678462806e-09, "loss": 0.0003, "num_tokens": 79682017.0, "reward": 1.09375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8924444444444446, "grad_norm": 0.6373373851754117, "kl": 0.343017578125, "learning_rate": 3.9809265576176146e-09, "loss": 0.0003, "num_tokens": 79832021.0, "reward": 1.1171875, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.896, "grad_norm": 0.5670410839375055, "kl": 0.344482421875, "learning_rate": 3.7288651837012745e-09, "loss": 0.0003, "num_tokens": 79982069.0, "reward": 1.1015625, "reward_std": 0.1322515904903412, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.8995555555555557, "grad_norm": 0.49575494365537615, "kl": 0.34130859375, "learning_rate": 3.4849869322348126e-09, "loss": 0.0003, "num_tokens": 80132161.0, "reward": 1.078125, "reward_std": 0.10605771839618683, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.903111111111111, "grad_norm": 0.5423541551066062, "kl": 0.290771484375, "learning_rate": 3.249299906829761e-09, "loss": 0.0003, "num_tokens": 80282201.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9066666666666667, "grad_norm": 0.4882483900158141, "kl": 0.30126953125, "learning_rate": 3.0218119389186502e-09, "loss": 0.0003, "num_tokens": 80432285.0, "reward": 1.1328125, "reward_std": 0.11904378235340118, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9102222222222223, "grad_norm": 0.48559011957259535, "kl": 0.32373046875, "learning_rate": 2.8025305874949945e-09, "loss": 0.0003, "num_tokens": 80582353.0, "reward": 1.0546875, "reward_std": 0.11420938372612, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9137777777777778, "grad_norm": 0.38442862283265805, "kl": 0.31689453125, "learning_rate": 2.5914631388619103e-09, "loss": 0.0003, "num_tokens": 80732457.0, "reward": 1.1015625, "reward_std": 0.06491719186306, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9173333333333333, "grad_norm": 0.5139610901110174, "kl": 0.329833984375, "learning_rate": 2.388616606390198e-09, "loss": 0.0003, "num_tokens": 80882525.0, "reward": 1.0859375, "reward_std": 0.10100159049034119, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9609375, "rewards/format_reward_func/std": 0.194504976272583, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9208888888888889, "grad_norm": 0.36707240059240254, "kl": 0.31884765625, "learning_rate": 2.193997730285141e-09, "loss": 0.0003, "num_tokens": 81032593.0, "reward": 1.109375, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9244444444444444, "grad_norm": 0.37143987164406483, "kl": 0.32080078125, "learning_rate": 2.0076129773627103e-09, "loss": 0.0003, "num_tokens": 81182665.0, "reward": 1.125, "reward_std": 0.06733439117670059, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.928, "grad_norm": 0.3334610385311675, "kl": 0.30859375, "learning_rate": 1.8294685408345167e-09, "loss": 0.0003, "num_tokens": 81332765.0, "reward": 1.109375, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9315555555555557, "grad_norm": 0.4965882408774517, "kl": 0.30908203125, "learning_rate": 1.6595703401020844e-09, "loss": 0.0003, "num_tokens": 81482829.0, "reward": 1.0546875, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.935111111111111, "grad_norm": 0.38044746993000517, "kl": 0.287841796875, "learning_rate": 1.497924020560204e-09, "loss": 0.0003, "num_tokens": 81632933.0, "reward": 1.0390625, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9386666666666668, "grad_norm": 0.42008431259677936, "kl": 0.323974609375, "learning_rate": 1.3445349534093598e-09, "loss": 0.0003, "num_tokens": 81782993.0, "reward": 1.1484375, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.15625, "rewards/equation_reward_func/std": 0.3645188808441162, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.942222222222222, "grad_norm": 0.25229812399779317, "kl": 0.29638671875, "learning_rate": 1.199408235477123e-09, "loss": 0.0003, "num_tokens": 81933053.0, "reward": 1.0390625, "reward_std": 0.033667195588350296, "rewards/equation_reward_func/mean": 0.0390625, "rewards/equation_reward_func/std": 0.194504976272583, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9457777777777778, "grad_norm": 0.674870799755692, "kl": 0.35791015625, "learning_rate": 1.0625486890488978e-09, "loss": 0.0004, "num_tokens": 82083173.0, "reward": 1.0625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9493333333333334, "grad_norm": 0.46280529484357197, "kl": 0.3115234375, "learning_rate": 9.339608617077165e-10, "loss": 0.0003, "num_tokens": 82233273.0, "reward": 1.09375, "reward_std": 0.1080445945262909, "rewards/equation_reward_func/mean": 0.1171875, "rewards/equation_reward_func/std": 0.322907418012619, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.952888888888889, "grad_norm": 0.441866278976047, "kl": 0.34423828125, "learning_rate": 8.136490261830553e-10, "loss": 0.0003, "num_tokens": 82383369.0, "reward": 1.0859375, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.1015625, "rewards/equation_reward_func/std": 0.3032590448856354, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9564444444444444, "grad_norm": 0.38249833378737674, "kl": 0.346923828125, "learning_rate": 7.016171802088633e-10, "loss": 0.0003, "num_tokens": 82533397.0, "reward": 1.15625, "reward_std": 0.049292195588350296, "rewards/equation_reward_func/mean": 0.1640625, "rewards/equation_reward_func/std": 0.371787428855896, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.96, "grad_norm": 0.44669554442836734, "kl": 0.310791015625, "learning_rate": 5.978690463908087e-10, "loss": 0.0003, "num_tokens": 82683493.0, "reward": 1.0703125, "reward_std": 0.09616719186306, "rewards/equation_reward_func/mean": 0.0859375, "rewards/equation_reward_func/std": 0.2813730239868164, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9635555555555557, "grad_norm": 0.35527802389757074, "kl": 0.304443359375, "learning_rate": 5.024080720824608e-10, "loss": 0.0003, "num_tokens": 82833597.0, "reward": 1.0703125, "reward_std": 0.08295938372612, "rewards/equation_reward_func/mean": 0.0703125, "rewards/equation_reward_func/std": 0.2566775679588318, "rewards/format_reward_func/mean": 1.0, "rewards/format_reward_func/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.967111111111111, "grad_norm": 0.44800982692930164, "kl": 0.301025390625, "learning_rate": 4.152374292708538e-10, "loss": 0.0003, "num_tokens": 82983697.0, "reward": 1.1015625, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.109375, "rewards/equation_reward_func/std": 0.31333550810813904, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9706666666666668, "grad_norm": 0.5383427366059405, "kl": 0.30419921875, "learning_rate": 3.363600144710155e-10, "loss": 0.0003, "num_tokens": 83133789.0, "reward": 1.0390625, "reward_std": 0.08295939117670059, "rewards/equation_reward_func/mean": 0.0625, "rewards/equation_reward_func/std": 0.24301259219646454, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.974222222222222, "grad_norm": 0.43534764469826404, "kl": 0.294921875, "learning_rate": 2.6577844862973877e-10, "loss": 0.0003, "num_tokens": 83283941.0, "reward": 1.0625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9777777777777779, "grad_norm": 0.4111665114904774, "kl": 0.33984375, "learning_rate": 2.0349507703851243e-10, "loss": 0.0003, "num_tokens": 83433985.0, "reward": 1.0625, "reward_std": 0.08054219186306, "rewards/equation_reward_func/mean": 0.078125, "rewards/equation_reward_func/std": 0.2694226801395416, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9813333333333332, "grad_norm": 0.3952216943474001, "kl": 0.344482421875, "learning_rate": 1.4951196925561127e-10, "loss": 0.0003, "num_tokens": 83584037.0, "reward": 1.0859375, "reward_std": 0.05918271467089653, "rewards/equation_reward_func/mean": 0.09375, "rewards/equation_reward_func/std": 0.29262590408325195, "rewards/format_reward_func/mean": 0.9921875, "rewards/format_reward_func/std": 0.0883883461356163, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.984888888888889, "grad_norm": 0.8904195917800815, "kl": 0.38232421875, "learning_rate": 1.0383091903720665e-10, "loss": 0.0004, "num_tokens": 83734133.0, "reward": 1.0078125, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.0234375, "rewards/equation_reward_func/std": 0.15188287198543549, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9884444444444445, "grad_norm": 0.4463127795360864, "kl": 0.299560546875, "learning_rate": 6.645344427794186e-11, "loss": 0.0003, "num_tokens": 83884293.0, "reward": 0.9921875, "reward_std": 0.078125, "rewards/equation_reward_func/mean": 0.015625, "rewards/equation_reward_func/std": 0.12450689822435379, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.992, "grad_norm": 0.4631812630632875, "kl": 0.338623046875, "learning_rate": 3.738078696036151e-11, "loss": 0.0003, "num_tokens": 84034373.0, "reward": 1.109375, "reward_std": 0.09375, "rewards/equation_reward_func/mean": 0.140625, "rewards/equation_reward_func/std": 0.3490002751350403, "rewards/format_reward_func/mean": 0.96875, "rewards/format_reward_func/std": 0.1746762990951538, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.9955555555555555, "grad_norm": 0.5109057359697526, "kl": 0.32275390625, "learning_rate": 1.6613913113694423e-11, "loss": 0.0003, "num_tokens": 84184493.0, "reward": 1.1015625, "reward_std": 0.12741719186306, "rewards/equation_reward_func/mean": 0.125, "rewards/equation_reward_func/std": 0.3320184051990509, "rewards/format_reward_func/mean": 0.9765625, "rewards/format_reward_func/std": 0.15188287198543549, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 1024.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 1024.0, "completions/min_terminated_length": 0.0, "epoch": 1.999111111111111, "grad_norm": 0.3977114295327049, "kl": 0.314697265625, "learning_rate": 4.153512781768231e-12, "loss": 0.0003, "num_tokens": 84334581.0, "reward": 1.15625, "reward_std": 0.0625, "rewards/equation_reward_func/mean": 0.171875, "rewards/equation_reward_func/std": 0.3787541687488556, "rewards/format_reward_func/mean": 0.984375, "rewards/format_reward_func/std": 0.12450689822435379, "step": 562 }, { "epoch": 1.999111111111111, "step": 562, "total_flos": 0.0, "train_loss": 0.006451380790508261, "train_runtime": 14914.6482, "train_samples_per_second": 1.207, "train_steps_per_second": 0.038 } ], "logging_steps": 1, "max_steps": 562, "num_input_tokens_seen": 84334581, "num_train_epochs": 2, "save_steps": 25, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }