{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.999657651489216, "eval_steps": 500, "global_step": 2555, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 371.64453125, "completions/mean_terminated_length": 371.64453125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.00039125544089597497, "frac_reward_zero_std": 0.0, "grad_norm": 1.056653503502096, "kl": 0.0, "learning_rate": 0.0, "loss": -0.0178, "num_tokens": 123237.0, "reward": 0.3212890625, "reward_std": 0.19036339223384857, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.28125, "rewards/format_reward/std": 0.45048993825912476, "rewards/tag_count_reward/mean": 0.6640625, "rewards/tag_count_reward/std": 0.4286750257015228, "step": 1 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 332.43359375, "completions/mean_terminated_length": 332.43359375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0007825108817919499, "frac_reward_zero_std": 0.0625, "grad_norm": 1.5580448384036365, "kl": 0.0, "learning_rate": 7.8125e-08, "loss": -0.0197, "num_tokens": 238324.0, "reward": 0.2015380859375, "reward_std": 0.1863287389278412, "rewards/accuracy_reward/mean": 0.10546875, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.20703125, "rewards/format_reward/std": 0.40597182512283325, "rewards/tag_count_reward/mean": 0.5615234375, "rewards/tag_count_reward/std": 0.45198261737823486, "step": 2 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 351.64453125, "completions/mean_terminated_length": 351.64453125, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.001173766322687925, "frac_reward_zero_std": 0.25, "grad_norm": 0.6901981604604581, "kl": 0.00021028518676757812, "learning_rate": 1.5625e-07, "loss": -0.0026, "num_tokens": 356457.0, "reward": 0.4578857421875, "reward_std": 0.15793009102344513, "rewards/accuracy_reward/mean": 0.38671875, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.1171875, "rewards/format_reward/std": 0.3222736418247223, "rewards/tag_count_reward/mean": 0.4521484375, "rewards/tag_count_reward/std": 0.4541802406311035, "step": 3 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 277.37890625, "completions/mean_terminated_length": 277.37890625, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0015650217635838999, "frac_reward_zero_std": 0.0, "grad_norm": 1.3800461161009432, "kl": 0.00029754638671875, "learning_rate": 2.3437500000000003e-07, "loss": -0.0342, "num_tokens": 456330.0, "reward": 0.46533203125, "reward_std": 0.2509230673313141, "rewards/accuracy_reward/mean": 0.33984375, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.3515625, "rewards/format_reward/std": 0.47839346528053284, "rewards/tag_count_reward/mean": 0.65234375, "rewards/tag_count_reward/std": 0.44251328706741333, "step": 4 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 320.4375, "completions/mean_terminated_length": 320.4375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.0019562772044798746, "frac_reward_zero_std": 0.0, "grad_norm": 1.3728354878081686, "kl": 0.0002491474151611328, "learning_rate": 3.125e-07, "loss": 0.0039, "num_tokens": 567818.0, "reward": 0.2552490234375, "reward_std": 0.22657975554466248, "rewards/accuracy_reward/mean": 0.15625, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.21484375, "rewards/format_reward/std": 0.4115184545516968, "rewards/tag_count_reward/mean": 0.5771484375, "rewards/tag_count_reward/std": 0.4635279178619385, "step": 5 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 788.0, "completions/max_terminated_length": 788.0, "completions/mean_length": 295.4140625, "completions/mean_terminated_length": 295.4140625, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.00234753264537585, "frac_reward_zero_std": 0.0, "grad_norm": 1.0077151661534194, "kl": 0.0003376007080078125, "learning_rate": 3.90625e-07, "loss": -0.015, "num_tokens": 672676.0, "reward": 0.314697265625, "reward_std": 0.27451425790786743, "rewards/accuracy_reward/mean": 0.203125, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.2890625, "rewards/format_reward/std": 0.45421501994132996, "rewards/tag_count_reward/mean": 0.603515625, "rewards/tag_count_reward/std": 0.4499753415584564, "step": 6 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 301.41015625, "completions/mean_terminated_length": 301.41015625, "completions/min_length": 78.0, "completions/min_terminated_length": 78.0, "epoch": 0.0027387880862718246, "frac_reward_zero_std": 0.0, "grad_norm": 1.4727930749184217, "kl": 0.0005757808685302734, "learning_rate": 4.6875000000000006e-07, "loss": 0.0204, "num_tokens": 779821.0, "reward": 0.255615234375, "reward_std": 0.20706875622272491, "rewards/accuracy_reward/mean": 0.08984375, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.48828125, "rewards/format_reward/std": 0.5008418560028076, "rewards/tag_count_reward/mean": 0.837890625, "rewards/tag_count_reward/std": 0.325531005859375, "step": 7 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 789.0, "completions/max_terminated_length": 789.0, "completions/mean_length": 377.3515625, "completions/mean_terminated_length": 377.3515625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.0031300435271677998, "frac_reward_zero_std": 0.0625, "grad_norm": 1.0838184278340066, "kl": 0.0004899501800537109, "learning_rate": 5.468750000000001e-07, "loss": 0.0079, "num_tokens": 905671.0, "reward": 0.1961669921875, "reward_std": 0.20801988244056702, "rewards/accuracy_reward/mean": 0.0625, "rewards/accuracy_reward/std": 0.24253563582897186, "rewards/format_reward/mean": 0.359375, "rewards/format_reward/std": 0.4807571768760681, "rewards/tag_count_reward/mean": 0.7099609375, "rewards/tag_count_reward/std": 0.4282985031604767, "step": 8 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 762.0, "completions/max_terminated_length": 762.0, "completions/mean_length": 287.8359375, "completions/mean_terminated_length": 287.8359375, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.0035212989680637745, "frac_reward_zero_std": 0.0, "grad_norm": 0.7534690863890925, "kl": 0.0017294883728027344, "learning_rate": 6.25e-07, "loss": 0.0091, "num_tokens": 1007725.0, "reward": 0.3004150390625, "reward_std": 0.27198725938796997, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.61328125, "rewards/format_reward/std": 0.4879522919654846, "rewards/tag_count_reward/mean": 0.8525390625, "rewards/tag_count_reward/std": 0.3253706693649292, "step": 9 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 292.60546875, "completions/mean_terminated_length": 292.60546875, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.003912554408959749, "frac_reward_zero_std": 0.0, "grad_norm": 1.0251191478515131, "kl": 0.0020313262939453125, "learning_rate": 7.03125e-07, "loss": -0.0038, "num_tokens": 1112232.0, "reward": 0.337158203125, "reward_std": 0.17504331469535828, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.6953125, "rewards/format_reward/std": 0.4611765742301941, "rewards/tag_count_reward/mean": 0.876953125, "rewards/tag_count_reward/std": 0.2591390311717987, "step": 10 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 298.15234375, "completions/mean_terminated_length": 298.15234375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.0043038098498557244, "frac_reward_zero_std": 0.0, "grad_norm": 0.6165330818030933, "kl": 0.0031337738037109375, "learning_rate": 7.8125e-07, "loss": 0.0109, "num_tokens": 1217983.0, "reward": 0.3399658203125, "reward_std": 0.2755941152572632, "rewards/accuracy_reward/mean": 0.140625, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.7265625, "rewards/format_reward/std": 0.446596622467041, "rewards/tag_count_reward/mean": 0.8681640625, "rewards/tag_count_reward/std": 0.29778948426246643, "step": 11 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 300.96484375, "completions/mean_terminated_length": 300.96484375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.0046950652907517, "frac_reward_zero_std": 0.0, "grad_norm": 0.8577544854575933, "kl": 0.00449371337890625, "learning_rate": 8.59375e-07, "loss": -0.0089, "num_tokens": 1325718.0, "reward": 0.3336181640625, "reward_std": 0.19202852249145508, "rewards/accuracy_reward/mean": 0.109375, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.84375, "rewards/format_reward/std": 0.3638034462928772, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.13543669879436493, "step": 12 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 320.85546875, "completions/mean_terminated_length": 320.85546875, "completions/min_length": 83.0, "completions/min_terminated_length": 83.0, "epoch": 0.005086320731647675, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5897809460219293, "kl": 0.007358551025390625, "learning_rate": 9.375000000000001e-07, "loss": 0.0211, "num_tokens": 1436641.0, "reward": 0.5096435546875, "reward_std": 0.40089431405067444, "rewards/accuracy_reward/mean": 0.27734375, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.91015625, "rewards/format_reward/std": 0.2865179479122162, "rewards/tag_count_reward/mean": 0.9482421875, "rewards/tag_count_reward/std": 0.19047628343105316, "step": 13 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 644.0, "completions/max_terminated_length": 644.0, "completions/mean_length": 313.6796875, "completions/mean_terminated_length": 313.6796875, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.005477576172543649, "frac_reward_zero_std": 0.125, "grad_norm": 0.649861379027964, "kl": 0.00589752197265625, "learning_rate": 1.0156250000000001e-06, "loss": -0.0201, "num_tokens": 1548239.0, "reward": 0.357666015625, "reward_std": 0.22131773829460144, "rewards/accuracy_reward/mean": 0.125, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3026638329029083, "rewards/tag_count_reward/mean": 0.962890625, "rewards/tag_count_reward/std": 0.12163044512271881, "step": 14 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 767.0, "completions/max_terminated_length": 767.0, "completions/mean_length": 297.19140625, "completions/mean_terminated_length": 297.19140625, "completions/min_length": 81.0, "completions/min_terminated_length": 81.0, "epoch": 0.005868831613439624, "frac_reward_zero_std": 0.0625, "grad_norm": 0.6353296432014309, "kl": 0.0031452178955078125, "learning_rate": 1.0937500000000001e-06, "loss": -0.0124, "num_tokens": 1656160.0, "reward": 0.3914794921875, "reward_std": 0.27253425121307373, "rewards/accuracy_reward/mean": 0.15234375, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.94140625, "rewards/format_reward/std": 0.23532284796237946, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.11474059522151947, "step": 15 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 622.0, "completions/max_terminated_length": 622.0, "completions/mean_length": 306.09765625, "completions/mean_terminated_length": 306.09765625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.0062600870543355995, "frac_reward_zero_std": 0.125, "grad_norm": 0.5394686912485336, "kl": 0.0040454864501953125, "learning_rate": 1.1718750000000001e-06, "loss": -0.0179, "num_tokens": 1764553.0, "reward": 0.3548583984375, "reward_std": 0.228677898645401, "rewards/accuracy_reward/mean": 0.1171875, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.9296875, "rewards/format_reward/std": 0.2561737895011902, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.11038573831319809, "step": 16 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 805.0, "completions/max_terminated_length": 805.0, "completions/mean_length": 315.94921875, "completions/mean_terminated_length": 315.94921875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.006651342495231575, "frac_reward_zero_std": 0.125, "grad_norm": 0.5469356744036993, "kl": 0.005649566650390625, "learning_rate": 1.25e-06, "loss": 0.005, "num_tokens": 1873948.0, "reward": 0.4365234375, "reward_std": 0.2897738218307495, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.11897231638431549, "step": 17 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 331.87109375, "completions/mean_terminated_length": 331.87109375, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.007042597936127549, "frac_reward_zero_std": 0.0, "grad_norm": 0.6315361392010271, "kl": 0.006923675537109375, "learning_rate": 1.328125e-06, "loss": 0.0219, "num_tokens": 1989659.0, "reward": 0.43701171875, "reward_std": 0.27888959646224976, "rewards/accuracy_reward/mean": 0.1953125, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.07248647511005402, "step": 18 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 632.0, "completions/max_terminated_length": 632.0, "completions/mean_length": 280.734375, "completions/mean_terminated_length": 280.734375, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.007433853377023524, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5921807094961375, "kl": 0.012939453125, "learning_rate": 1.40625e-06, "loss": -0.0169, "num_tokens": 2089463.0, "reward": 0.4757080078125, "reward_std": 0.3893081843852997, "rewards/accuracy_reward/mean": 0.234375, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.9814453125, "rewards/tag_count_reward/std": 0.10336240381002426, "step": 19 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 380.8515625, "completions/mean_terminated_length": 380.8515625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.007825108817919499, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4476245303069078, "kl": 0.0103759765625, "learning_rate": 1.484375e-06, "loss": 0.0087, "num_tokens": 2216321.0, "reward": 0.4844970703125, "reward_std": 0.30757975578308105, "rewards/accuracy_reward/mean": 0.25, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 0.921875, "rewards/format_reward/std": 0.26889389753341675, "rewards/tag_count_reward/mean": 0.9541015625, "rewards/tag_count_reward/std": 0.16888554394245148, "step": 20 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 677.0, "completions/max_terminated_length": 677.0, "completions/mean_length": 312.48828125, "completions/mean_terminated_length": 312.48828125, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.008216364258815475, "frac_reward_zero_std": 0.125, "grad_norm": 0.6305615713099669, "kl": 0.01300048828125, "learning_rate": 1.5625e-06, "loss": -0.0019, "num_tokens": 2327454.0, "reward": 0.4166259765625, "reward_std": 0.27601829171180725, "rewards/accuracy_reward/mean": 0.171875, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9814453125, "rewards/tag_count_reward/std": 0.11672601103782654, "step": 21 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 809.0, "completions/max_terminated_length": 809.0, "completions/mean_length": 337.9453125, "completions/mean_terminated_length": 337.9453125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.008607619699711449, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5615736883892908, "kl": 0.0142669677734375, "learning_rate": 1.640625e-06, "loss": -0.0151, "num_tokens": 2443136.0, "reward": 0.500244140625, "reward_std": 0.3281666040420532, "rewards/accuracy_reward/mean": 0.26171875, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.970703125, "rewards/tag_count_reward/std": 0.1276526153087616, "step": 22 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 335.328125, "completions/mean_terminated_length": 335.328125, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.008998875140607425, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5740573673098712, "kl": 0.0157470703125, "learning_rate": 1.71875e-06, "loss": 0.0146, "num_tokens": 2556212.0, "reward": 0.6429443359375, "reward_std": 0.40205246210098267, "rewards/accuracy_reward/mean": 0.40234375, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.09031175076961517, "step": 23 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 662.0, "completions/max_terminated_length": 662.0, "completions/mean_length": 333.671875, "completions/mean_terminated_length": 333.671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.0093901305815034, "frac_reward_zero_std": 0.125, "grad_norm": 0.5605945367179157, "kl": 0.01568603515625, "learning_rate": 1.796875e-06, "loss": 0.0301, "num_tokens": 2670656.0, "reward": 0.6842041015625, "reward_std": 0.3044005036354065, "rewards/accuracy_reward/mean": 0.44140625, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.0800139307975769, "step": 24 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1190.0, "completions/max_terminated_length": 1190.0, "completions/mean_length": 412.0703125, "completions/mean_terminated_length": 412.0703125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.009781386022399374, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4843329868049502, "kl": 0.0188446044921875, "learning_rate": 1.8750000000000003e-06, "loss": 0.0001, "num_tokens": 2804434.0, "reward": 0.8106689453125, "reward_std": 0.3731144964694977, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.92578125, "rewards/format_reward/std": 0.2626400291919708, "rewards/tag_count_reward/mean": 0.9658203125, "rewards/tag_count_reward/std": 0.1293032318353653, "step": 25 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1535.0, "completions/max_terminated_length": 1535.0, "completions/mean_length": 364.12109375, "completions/mean_terminated_length": 364.12109375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.01017264146329535, "frac_reward_zero_std": 0.0, "grad_norm": 0.510937019588302, "kl": 0.0231475830078125, "learning_rate": 1.953125e-06, "loss": 0.0157, "num_tokens": 2925953.0, "reward": 0.7669677734375, "reward_std": 0.3331727087497711, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.91015625, "rewards/format_reward/std": 0.2865179479122162, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.0775839164853096, "step": 26 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 912.0, "completions/max_terminated_length": 912.0, "completions/mean_length": 349.58203125, "completions/mean_terminated_length": 349.58203125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.010563896904191324, "frac_reward_zero_std": 0.0, "grad_norm": 0.5265116045490308, "kl": 0.0306396484375, "learning_rate": 2.0312500000000002e-06, "loss": 0.0025, "num_tokens": 3044198.0, "reward": 0.8663330078125, "reward_std": 0.4105108678340912, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.06738705933094025, "step": 27 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1084.0, "completions/max_terminated_length": 1084.0, "completions/mean_length": 438.203125, "completions/mean_terminated_length": 438.203125, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.010955152345087298, "frac_reward_zero_std": 0.0625, "grad_norm": 0.5107103498391776, "kl": 0.0255279541015625, "learning_rate": 2.109375e-06, "loss": -0.0128, "num_tokens": 3186266.0, "reward": 0.9063720703125, "reward_std": 0.27118170261383057, "rewards/accuracy_reward/mean": 0.66796875, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.10635717958211899, "step": 28 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 399.88671875, "completions/mean_terminated_length": 399.88671875, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.011346407785983274, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4490425239005144, "kl": 0.0359344482421875, "learning_rate": 2.1875000000000002e-06, "loss": -0.0106, "num_tokens": 3317885.0, "reward": 0.9732666015625, "reward_std": 0.2866381108760834, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.07363312691450119, "step": 29 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 918.0, "completions/max_terminated_length": 918.0, "completions/mean_length": 400.96875, "completions/mean_terminated_length": 400.96875, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.011737663226879249, "frac_reward_zero_std": 0.125, "grad_norm": 0.46856722498250697, "kl": 0.054107666015625, "learning_rate": 2.265625e-06, "loss": 0.0162, "num_tokens": 3452853.0, "reward": 1.104248046875, "reward_std": 0.20389705896377563, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.05334262177348137, "step": 30 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 855.0, "completions/max_terminated_length": 855.0, "completions/mean_length": 462.81640625, "completions/mean_terminated_length": 462.81640625, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.012128918667775223, "frac_reward_zero_std": 0.5, "grad_norm": 0.29759009051559965, "kl": 0.034332275390625, "learning_rate": 2.3437500000000002e-06, "loss": -0.0074, "num_tokens": 3600934.0, "reward": 1.1375732421875, "reward_std": 0.12805725634098053, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 31 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 411.9140625, "completions/mean_terminated_length": 411.9140625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.012520174108671199, "frac_reward_zero_std": 0.25, "grad_norm": 0.4255624770789882, "kl": 0.0452880859375, "learning_rate": 2.421875e-06, "loss": 0.0266, "num_tokens": 3736880.0, "reward": 1.1656494140625, "reward_std": 0.18115057051181793, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.0830206349492073, "step": 32 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 983.0, "completions/max_terminated_length": 983.0, "completions/mean_length": 448.890625, "completions/mean_terminated_length": 448.890625, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.012911429549567173, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3474984446945337, "kl": 0.0399169921875, "learning_rate": 2.5e-06, "loss": 0.0317, "num_tokens": 3883524.0, "reward": 1.1275634765625, "reward_std": 0.14843381941318512, "rewards/accuracy_reward/mean": 0.8828125, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.09666129946708679, "step": 33 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 435.921875, "completions/mean_terminated_length": 435.921875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.01330268499046315, "frac_reward_zero_std": 0.3125, "grad_norm": 0.36820596722721366, "kl": 0.044830322265625, "learning_rate": 2.5781250000000004e-06, "loss": 0.0416, "num_tokens": 4024096.0, "reward": 1.041015625, "reward_std": 0.16744205355644226, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06537505239248276, "step": 34 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 395.62109375, "completions/mean_terminated_length": 395.62109375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.013693940431359124, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4144417783667151, "kl": 0.060150146484375, "learning_rate": 2.65625e-06, "loss": 0.023, "num_tokens": 4155695.0, "reward": 0.9661865234375, "reward_std": 0.08885198831558228, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.07060634344816208, "step": 35 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 429.9765625, "completions/mean_terminated_length": 429.9765625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.014085195872255098, "frac_reward_zero_std": 0.4375, "grad_norm": 0.33111091924433284, "kl": 0.053619384765625, "learning_rate": 2.7343750000000004e-06, "loss": 0.036, "num_tokens": 4295193.0, "reward": 1.0474853515625, "reward_std": 0.14912518858909607, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0464647077023983, "step": 36 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 427.26171875, "completions/mean_terminated_length": 427.26171875, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.014476451313151074, "frac_reward_zero_std": 0.5, "grad_norm": 0.4120151929850403, "kl": 0.058441162109375, "learning_rate": 2.8125e-06, "loss": 0.0088, "num_tokens": 4434716.0, "reward": 1.06396484375, "reward_std": 0.10219889879226685, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06213126704096794, "step": 37 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 833.0, "completions/max_terminated_length": 833.0, "completions/mean_length": 382.15234375, "completions/mean_terminated_length": 382.15234375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.014867706754047048, "frac_reward_zero_std": 0.625, "grad_norm": 0.29605824858735524, "kl": 0.060546875, "learning_rate": 2.8906250000000004e-06, "loss": 0.0233, "num_tokens": 4563363.0, "reward": 1.1953125, "reward_std": 0.11612267792224884, "rewards/accuracy_reward/mean": 0.9453125, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 38 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 743.0, "completions/max_terminated_length": 743.0, "completions/mean_length": 344.4453125, "completions/mean_terminated_length": 344.4453125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.015258962194943023, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3675795192258731, "kl": 0.07659912109375, "learning_rate": 2.96875e-06, "loss": 0.0186, "num_tokens": 4678037.0, "reward": 1.0341796875, "reward_std": 0.06882990896701813, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.4115184545516968, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0625, "step": 39 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 361.85546875, "completions/mean_terminated_length": 361.85546875, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.015650217635838997, "frac_reward_zero_std": 0.8125, "grad_norm": 0.24473603001114894, "kl": 0.0628662109375, "learning_rate": 3.0468750000000004e-06, "loss": 0.0243, "num_tokens": 4798512.0, "reward": 1.171875, "reward_std": 0.05259781330823898, "rewards/accuracy_reward/mean": 0.921875, "rewards/accuracy_reward/std": 0.26889389753341675, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 40 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 374.1171875, "completions/mean_terminated_length": 374.1171875, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.016041473076734973, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2427737308783706, "kl": 0.061309814453125, "learning_rate": 3.125e-06, "loss": 0.0123, "num_tokens": 4924382.0, "reward": 1.1673583984375, "reward_std": 0.04601725563406944, "rewards/accuracy_reward/mean": 0.91796875, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 41 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 334.8984375, "completions/mean_terminated_length": 334.8984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.01643272851763095, "frac_reward_zero_std": 0.875, "grad_norm": 0.2044506293241314, "kl": 0.0718994140625, "learning_rate": 3.2031250000000004e-06, "loss": -0.0054, "num_tokens": 5039988.0, "reward": 1.1126708984375, "reward_std": 0.037000760436058044, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 42 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 611.0, "completions/max_terminated_length": 611.0, "completions/mean_length": 341.0, "completions/mean_terminated_length": 341.0, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.01682398395852692, "frac_reward_zero_std": 0.75, "grad_norm": 0.22134808524368088, "kl": 0.064300537109375, "learning_rate": 3.28125e-06, "loss": -0.0045, "num_tokens": 5158180.0, "reward": 1.09375, "reward_std": 0.0767945945262909, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 43 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 340.1171875, "completions/mean_terminated_length": 340.1171875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.017215239399422898, "frac_reward_zero_std": 0.625, "grad_norm": 0.3540463853586698, "kl": 0.064300537109375, "learning_rate": 3.3593750000000003e-06, "loss": 0.0244, "num_tokens": 5274002.0, "reward": 1.221923828125, "reward_std": 0.09008685499429703, "rewards/accuracy_reward/mean": 0.97265625, "rewards/accuracy_reward/std": 0.1634024828672409, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 44 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 832.0, "completions/max_terminated_length": 832.0, "completions/mean_length": 327.9453125, "completions/mean_terminated_length": 327.9453125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.017606494840318874, "frac_reward_zero_std": 0.6875, "grad_norm": 0.34074195784894107, "kl": 0.0738525390625, "learning_rate": 3.4375e-06, "loss": 0.0159, "num_tokens": 5388084.0, "reward": 1.1484375, "reward_std": 0.09375, "rewards/accuracy_reward/mean": 0.8984375, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 45 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 745.0, "completions/max_terminated_length": 745.0, "completions/mean_length": 356.88671875, "completions/mean_terminated_length": 356.88671875, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.01799775028121485, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3015637530367123, "kl": 0.064208984375, "learning_rate": 3.5156250000000003e-06, "loss": 0.0153, "num_tokens": 5510359.0, "reward": 1.21875, "reward_std": 0.09341736882925034, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 46 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 661.0, "completions/max_terminated_length": 661.0, "completions/mean_length": 311.22265625, "completions/mean_terminated_length": 311.22265625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.018389005722110822, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2176617231360478, "kl": 0.07904052734375, "learning_rate": 3.59375e-06, "loss": 0.0149, "num_tokens": 5620736.0, "reward": 1.23828125, "reward_std": 0.046875, "rewards/accuracy_reward/mean": 0.98828125, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 47 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 309.78515625, "completions/mean_terminated_length": 309.78515625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.0187802611630068, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3955769546628962, "kl": 0.07366943359375, "learning_rate": 3.6718750000000003e-06, "loss": 0.0564, "num_tokens": 5728537.0, "reward": 1.21875, "reward_std": 0.11509781330823898, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 48 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 657.0, "completions/max_terminated_length": 657.0, "completions/mean_length": 279.34765625, "completions/mean_terminated_length": 279.34765625, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.019171516603902775, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2191854603581709, "kl": 0.08612060546875, "learning_rate": 3.7500000000000005e-06, "loss": 0.0056, "num_tokens": 5829378.0, "reward": 1.17578125, "reward_std": 0.046875, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 49 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 947.0, "completions/max_terminated_length": 947.0, "completions/mean_length": 364.01953125, "completions/mean_terminated_length": 364.01953125, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.019562772044798747, "frac_reward_zero_std": 0.75, "grad_norm": 0.23609139081398942, "kl": 0.059906005859375, "learning_rate": 3.828125000000001e-06, "loss": 0.0086, "num_tokens": 5951799.0, "reward": 1.2100830078125, "reward_std": 0.07241250574588776, "rewards/accuracy_reward/mean": 0.9609375, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 50 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 292.16796875, "completions/mean_terminated_length": 292.16796875, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.019954027485694723, "frac_reward_zero_std": 0.5, "grad_norm": 0.7742888060005116, "kl": 0.0869140625, "learning_rate": 3.90625e-06, "loss": 0.0148, "num_tokens": 6056290.0, "reward": 1.1900634765625, "reward_std": 0.13711875677108765, "rewards/accuracy_reward/mean": 0.94140625, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 51 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 716.0, "completions/max_terminated_length": 716.0, "completions/mean_length": 304.48046875, "completions/mean_terminated_length": 304.48046875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.0203452829265907, "frac_reward_zero_std": 0.75, "grad_norm": 0.4979847722829895, "kl": 0.10369873046875, "learning_rate": 3.984375e-06, "loss": 0.007, "num_tokens": 6164717.0, "reward": 1.1746826171875, "reward_std": 0.03980960324406624, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.0808708667755127, "step": 52 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 316.12890625, "completions/mean_terminated_length": 316.12890625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.020736538367486672, "frac_reward_zero_std": 0.625, "grad_norm": 0.4035033482598689, "kl": 0.0843505859375, "learning_rate": 4.0625000000000005e-06, "loss": 0.0187, "num_tokens": 6273934.0, "reward": 1.1590576171875, "reward_std": 0.09208908677101135, "rewards/accuracy_reward/mean": 0.91015625, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 53 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 641.0, "completions/max_terminated_length": 641.0, "completions/mean_length": 333.59765625, "completions/mean_terminated_length": 333.59765625, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.021127793808382648, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3012588078542073, "kl": 0.06695556640625, "learning_rate": 4.140625000000001e-06, "loss": 0.0116, "num_tokens": 6390439.0, "reward": 1.16796875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.91796875, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 54 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 587.0, "completions/max_terminated_length": 587.0, "completions/mean_length": 273.84375, "completions/mean_terminated_length": 273.84375, "completions/min_length": 89.0, "completions/min_terminated_length": 89.0, "epoch": 0.021519049249278624, "frac_reward_zero_std": 0.8125, "grad_norm": 0.20657315327114795, "kl": 0.0809326171875, "learning_rate": 4.21875e-06, "loss": -0.0071, "num_tokens": 6489999.0, "reward": 1.234375, "reward_std": 0.05259781330823898, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 55 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 334.48046875, "completions/mean_terminated_length": 334.48046875, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.021910304690174597, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2906753426203306, "kl": 0.065185546875, "learning_rate": 4.296875e-06, "loss": 0.0174, "num_tokens": 6607178.0, "reward": 1.1015625, "reward_std": 0.10189647972583771, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 56 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 687.0, "completions/max_terminated_length": 687.0, "completions/mean_length": 293.68359375, "completions/mean_terminated_length": 293.68359375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.022301560131070573, "frac_reward_zero_std": 0.625, "grad_norm": 0.4045560074456969, "kl": 0.073486328125, "learning_rate": 4.3750000000000005e-06, "loss": 0.0111, "num_tokens": 6711625.0, "reward": 1.0859375, "reward_std": 0.11476518213748932, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 57 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 640.0, "completions/max_terminated_length": 640.0, "completions/mean_length": 251.9453125, "completions/mean_terminated_length": 251.9453125, "completions/min_length": 114.0, "completions/min_terminated_length": 114.0, "epoch": 0.02269281557196655, "frac_reward_zero_std": 1.0, "grad_norm": 0.041681500850313746, "kl": 0.0821533203125, "learning_rate": 4.453125000000001e-06, "loss": 0.0008, "num_tokens": 6805675.0, "reward": 1.125, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 58 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 306.78125, "completions/mean_terminated_length": 306.78125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.02308407101286252, "frac_reward_zero_std": 0.75, "grad_norm": 0.2866526348905406, "kl": 0.06268310546875, "learning_rate": 4.53125e-06, "loss": 0.0059, "num_tokens": 6913939.0, "reward": 1.162841796875, "reward_std": 0.05042741075158119, "rewards/accuracy_reward/mean": 0.9140625, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 59 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 722.0, "completions/max_terminated_length": 722.0, "completions/mean_length": 295.8671875, "completions/mean_terminated_length": 295.8671875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.023475326453758497, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2147298127512319, "kl": 0.056610107421875, "learning_rate": 4.609375e-06, "loss": 0.0095, "num_tokens": 7020353.0, "reward": 1.0384521484375, "reward_std": 0.04798600450158119, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 60 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 626.0, "completions/max_terminated_length": 626.0, "completions/mean_length": 270.15625, "completions/mean_terminated_length": 270.15625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.023866581894654473, "frac_reward_zero_std": 0.8125, "grad_norm": 0.3092020812507572, "kl": 0.066009521484375, "learning_rate": 4.6875000000000004e-06, "loss": 0.0043, "num_tokens": 7119145.0, "reward": 1.09765625, "reward_std": 0.06116959825158119, "rewards/accuracy_reward/mean": 0.84765625, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 61 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 282.71484375, "completions/mean_terminated_length": 282.71484375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.024257837335550446, "frac_reward_zero_std": 0.875, "grad_norm": 0.24993969018740594, "kl": 0.072021484375, "learning_rate": 4.765625000000001e-06, "loss": -0.0022, "num_tokens": 7221376.0, "reward": 1.05078125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 62 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 636.0, "completions/max_terminated_length": 636.0, "completions/mean_length": 273.43359375, "completions/mean_terminated_length": 273.43359375, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.024649092776446422, "frac_reward_zero_std": 0.8125, "grad_norm": 0.27850439311237696, "kl": 0.0723876953125, "learning_rate": 4.84375e-06, "loss": 0.027, "num_tokens": 7320735.0, "reward": 1.1953125, "reward_std": 0.05920084938406944, "rewards/accuracy_reward/mean": 0.9453125, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 63 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 669.0, "completions/max_terminated_length": 669.0, "completions/mean_length": 314.609375, "completions/mean_terminated_length": 314.609375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.025040348217342398, "frac_reward_zero_std": 0.8125, "grad_norm": 0.23600631896435836, "kl": 0.07183837890625, "learning_rate": 4.921875e-06, "loss": 0.0032, "num_tokens": 7430459.0, "reward": 1.140625, "reward_std": 0.06899453699588776, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 64 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 308.16796875, "completions/mean_terminated_length": 308.16796875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.025431603658238374, "frac_reward_zero_std": 0.875, "grad_norm": 0.2244397848317128, "kl": 0.061920166015625, "learning_rate": 5e-06, "loss": 0.0183, "num_tokens": 7538998.0, "reward": 1.135498046875, "reward_std": 0.06088460981845856, "rewards/accuracy_reward/mean": 0.88671875, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 65 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 579.0, "completions/max_terminated_length": 579.0, "completions/mean_length": 303.08203125, "completions/mean_terminated_length": 303.08203125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.025822859099134347, "frac_reward_zero_std": 0.75, "grad_norm": 0.2834506921433905, "kl": 0.056549072265625, "learning_rate": 5.078125000000001e-06, "loss": 0.0031, "num_tokens": 7645147.0, "reward": 1.10546875, "reward_std": 0.06822281330823898, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 66 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 604.0, "completions/max_terminated_length": 604.0, "completions/mean_length": 330.15234375, "completions/mean_terminated_length": 330.15234375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.026214114540030323, "frac_reward_zero_std": 0.75, "grad_norm": 0.2583409289341213, "kl": 0.0560302734375, "learning_rate": 5.156250000000001e-06, "loss": -0.0007, "num_tokens": 7758882.0, "reward": 1.02734375, "reward_std": 0.08251741528511047, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 67 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 573.0, "completions/max_terminated_length": 573.0, "completions/mean_length": 312.36328125, "completions/mean_terminated_length": 312.36328125, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.0266053699809263, "frac_reward_zero_std": 0.75, "grad_norm": 0.24007494502022586, "kl": 0.061981201171875, "learning_rate": 5.234375e-06, "loss": 0.0053, "num_tokens": 7868543.0, "reward": 1.15625, "reward_std": 0.08054865896701813, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 68 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 862.0, "completions/max_terminated_length": 862.0, "completions/mean_length": 341.28515625, "completions/mean_terminated_length": 341.28515625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.02699662542182227, "frac_reward_zero_std": 0.75, "grad_norm": 0.26328819351234345, "kl": 0.0594482421875, "learning_rate": 5.3125e-06, "loss": 0.0206, "num_tokens": 7985208.0, "reward": 1.07421875, "reward_std": 0.08461953699588776, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 69 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1180.0, "completions/max_terminated_length": 1180.0, "completions/mean_length": 332.31640625, "completions/mean_terminated_length": 332.31640625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.027387880862718247, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1342705874407388, "kl": 0.063201904296875, "learning_rate": 5.390625000000001e-06, "loss": -0.0018, "num_tokens": 8099193.0, "reward": 1.14453125, "reward_std": 0.029919598251581192, "rewards/accuracy_reward/mean": 0.89453125, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 70 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 565.0, "completions/max_terminated_length": 565.0, "completions/mean_length": 350.25390625, "completions/mean_terminated_length": 350.25390625, "completions/min_length": 156.0, "completions/min_terminated_length": 156.0, "epoch": 0.027779136303614223, "frac_reward_zero_std": 0.875, "grad_norm": 0.19691347208684246, "kl": 0.055755615234375, "learning_rate": 5.468750000000001e-06, "loss": 0.0022, "num_tokens": 8217626.0, "reward": 1.23828125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward/mean": 0.98828125, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 71 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 345.1328125, "completions/mean_terminated_length": 345.1328125, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.028170391744510196, "frac_reward_zero_std": 0.875, "grad_norm": 0.2436623751689625, "kl": 0.087799072265625, "learning_rate": 5.546875e-06, "loss": 0.001, "num_tokens": 8334732.0, "reward": 1.1171875, "reward_std": 0.03125, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 72 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 598.0, "completions/max_terminated_length": 598.0, "completions/mean_length": 335.578125, "completions/mean_terminated_length": 335.578125, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.028561647185406172, "frac_reward_zero_std": 0.625, "grad_norm": 0.32834796047579556, "kl": 0.069000244140625, "learning_rate": 5.625e-06, "loss": 0.0047, "num_tokens": 8450448.0, "reward": 1.1328125, "reward_std": 0.12466736882925034, "rewards/accuracy_reward/mean": 0.8828125, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 73 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 880.0, "completions/max_terminated_length": 880.0, "completions/mean_length": 362.5625, "completions/mean_terminated_length": 362.5625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.028952902626302148, "frac_reward_zero_std": 0.75, "grad_norm": 0.251227899413881, "kl": 0.06622314453125, "learning_rate": 5.7031250000000006e-06, "loss": 0.0127, "num_tokens": 8573632.0, "reward": 1.2265625, "reward_std": 0.07394562661647797, "rewards/accuracy_reward/mean": 0.9765625, "rewards/accuracy_reward/std": 0.15158477425575256, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 74 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 806.0, "completions/max_terminated_length": 806.0, "completions/mean_length": 334.59375, "completions/mean_terminated_length": 334.59375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.02934415806719812, "frac_reward_zero_std": 0.75, "grad_norm": 0.22670865477618635, "kl": 0.0703125, "learning_rate": 5.781250000000001e-06, "loss": 0.0066, "num_tokens": 8688632.0, "reward": 1.10546875, "reward_std": 0.06822281330823898, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 75 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 378.99609375, "completions/mean_terminated_length": 378.99609375, "completions/min_length": 203.0, "completions/min_terminated_length": 203.0, "epoch": 0.029735413508094097, "frac_reward_zero_std": 0.5625, "grad_norm": 0.32221532036368367, "kl": 0.053375244140625, "learning_rate": 5.859375e-06, "loss": 0.0082, "num_tokens": 8815447.0, "reward": 1.125, "reward_std": 0.14635254442691803, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 76 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 623.0, "completions/max_terminated_length": 623.0, "completions/mean_length": 328.953125, "completions/mean_terminated_length": 328.953125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.030126668948990073, "frac_reward_zero_std": 0.8125, "grad_norm": 0.23962492787567743, "kl": 0.0650634765625, "learning_rate": 5.9375e-06, "loss": 0.0048, "num_tokens": 8931003.0, "reward": 1.119873046875, "reward_std": 0.0205078125, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 77 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 674.0, "completions/max_terminated_length": 674.0, "completions/mean_length": 312.0625, "completions/mean_terminated_length": 312.0625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.030517924389886045, "frac_reward_zero_std": 0.8125, "grad_norm": 0.26697410164225055, "kl": 0.070343017578125, "learning_rate": 6.0156250000000005e-06, "loss": 0.009, "num_tokens": 9039819.0, "reward": 1.23046875, "reward_std": 0.05644455552101135, "rewards/accuracy_reward/mean": 0.98046875, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 78 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 812.0, "completions/max_terminated_length": 812.0, "completions/mean_length": 362.578125, "completions/mean_terminated_length": 362.578125, "completions/min_length": 144.0, "completions/min_terminated_length": 144.0, "epoch": 0.03090917983078202, "frac_reward_zero_std": 0.8125, "grad_norm": 0.20427987757614013, "kl": 0.060699462890625, "learning_rate": 6.093750000000001e-06, "loss": 0.0009, "num_tokens": 9162127.0, "reward": 1.233154296875, "reward_std": 0.04503815248608589, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 79 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 584.0, "completions/max_terminated_length": 584.0, "completions/mean_length": 279.89453125, "completions/mean_terminated_length": 279.89453125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.031300435271677994, "frac_reward_zero_std": 0.5625, "grad_norm": 0.36809480332259914, "kl": 0.0728759765625, "learning_rate": 6.171875e-06, "loss": -0.0093, "num_tokens": 9261988.0, "reward": 1.09912109375, "reward_std": 0.08028796315193176, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 80 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 891.0, "completions/max_terminated_length": 891.0, "completions/mean_length": 313.77734375, "completions/mean_terminated_length": 313.77734375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.031691690712573974, "frac_reward_zero_std": 0.8125, "grad_norm": 0.22380499136186652, "kl": 0.067047119140625, "learning_rate": 6.25e-06, "loss": 0.0025, "num_tokens": 9371515.0, "reward": 1.0567626953125, "reward_std": 0.02053576149046421, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 81 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 305.26953125, "completions/mean_terminated_length": 305.26953125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.032082946153469946, "frac_reward_zero_std": 0.875, "grad_norm": 0.20448208385474037, "kl": 0.0716552734375, "learning_rate": 6.3281250000000005e-06, "loss": 0.0093, "num_tokens": 9477936.0, "reward": 1.23828125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward/mean": 0.98828125, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 82 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 608.0, "completions/max_terminated_length": 608.0, "completions/mean_length": 284.796875, "completions/mean_terminated_length": 284.796875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.03247420159436592, "frac_reward_zero_std": 0.875, "grad_norm": 0.2793180931475642, "kl": 0.10919189453125, "learning_rate": 6.406250000000001e-06, "loss": 0.0051, "num_tokens": 9578940.0, "reward": 1.1171875, "reward_std": 0.03125, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 83 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 630.0, "completions/max_terminated_length": 630.0, "completions/mean_length": 289.52734375, "completions/mean_terminated_length": 289.52734375, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.0328654570352619, "frac_reward_zero_std": 1.0, "grad_norm": 0.055066518358877194, "kl": 0.0797119140625, "learning_rate": 6.484375000000001e-06, "loss": 0.0008, "num_tokens": 9681971.0, "reward": 1.25, "reward_std": 0.0, "rewards/accuracy_reward/mean": 1.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 84 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 617.0, "completions/max_terminated_length": 617.0, "completions/mean_length": 288.765625, "completions/mean_terminated_length": 288.765625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.03325671247615787, "frac_reward_zero_std": 0.9375, "grad_norm": 0.18202412725307682, "kl": 0.07745361328125, "learning_rate": 6.5625e-06, "loss": 0.004, "num_tokens": 9785511.0, "reward": 1.24609375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.99609375, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 85 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 551.0, "completions/max_terminated_length": 551.0, "completions/mean_length": 301.06640625, "completions/mean_terminated_length": 301.06640625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.03364796791705384, "frac_reward_zero_std": 0.8125, "grad_norm": 0.25205888242493957, "kl": 0.08465576171875, "learning_rate": 6.6406250000000005e-06, "loss": -0.0023, "num_tokens": 9893112.0, "reward": 1.1484375, "reward_std": 0.07206955552101135, "rewards/accuracy_reward/mean": 0.8984375, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 86 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 357.19140625, "completions/mean_terminated_length": 357.19140625, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.03403922335794982, "frac_reward_zero_std": 0.875, "grad_norm": 0.6038793573190792, "kl": 0.12213134765625, "learning_rate": 6.718750000000001e-06, "loss": 0.0013, "num_tokens": 10012153.0, "reward": 1.13671875, "reward_std": 0.051267411559820175, "rewards/accuracy_reward/mean": 0.88671875, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 87 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 735.0, "completions/max_terminated_length": 735.0, "completions/mean_length": 336.21875, "completions/mean_terminated_length": 336.21875, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.034430478798845796, "frac_reward_zero_std": 0.8125, "grad_norm": 0.239553743834147, "kl": 0.064239501953125, "learning_rate": 6.796875000000001e-06, "loss": 0.0145, "num_tokens": 10128033.0, "reward": 1.15234375, "reward_std": 0.06327171623706818, "rewards/accuracy_reward/mean": 0.90234375, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 88 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 784.0, "completions/max_terminated_length": 784.0, "completions/mean_length": 366.0234375, "completions/mean_terminated_length": 366.0234375, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.03482173423974177, "frac_reward_zero_std": 0.625, "grad_norm": 0.3596195244415184, "kl": 0.0692138671875, "learning_rate": 6.875e-06, "loss": 0.0083, "num_tokens": 10251879.0, "reward": 1.06640625, "reward_std": 0.13039018213748932, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 89 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 771.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 357.33984375, "completions/mean_terminated_length": 357.33984375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.03521298968063775, "frac_reward_zero_std": 0.6875, "grad_norm": 0.29813084014291796, "kl": 0.063385009765625, "learning_rate": 6.9531250000000004e-06, "loss": 0.0141, "num_tokens": 10373198.0, "reward": 1.0390625, "reward_std": 0.1048629954457283, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 90 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 676.0, "completions/max_terminated_length": 676.0, "completions/mean_length": 346.953125, "completions/mean_terminated_length": 346.953125, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.03560424512153372, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3028084073612818, "kl": 0.0682373046875, "learning_rate": 7.031250000000001e-06, "loss": -0.0103, "num_tokens": 10491986.0, "reward": 1.140625, "reward_std": 0.09477485716342926, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 91 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 325.8515625, "completions/mean_terminated_length": 325.8515625, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.0359955005624297, "frac_reward_zero_std": 0.6875, "grad_norm": 0.33052544771435843, "kl": 0.0943603515625, "learning_rate": 7.109375000000001e-06, "loss": 0.0202, "num_tokens": 10604748.0, "reward": 1.23388671875, "reward_std": 0.064453125, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 92 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 344.84765625, "completions/mean_terminated_length": 344.84765625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.03638675600332567, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2307975359923657, "kl": 0.078125, "learning_rate": 7.1875e-06, "loss": 0.0071, "num_tokens": 10721717.0, "reward": 1.1517333984375, "reward_std": 0.09620161354541779, "rewards/accuracy_reward/mean": 0.90234375, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 93 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 354.4609375, "completions/mean_terminated_length": 354.4609375, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.036778011444221645, "frac_reward_zero_std": 0.625, "grad_norm": 0.3075211654184873, "kl": 0.07550048828125, "learning_rate": 7.265625e-06, "loss": 0.0101, "num_tokens": 10842939.0, "reward": 1.00732421875, "reward_std": 0.11956727504730225, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 94 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 503.0, "completions/max_terminated_length": 503.0, "completions/mean_length": 307.1796875, "completions/mean_terminated_length": 307.1796875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.037169266885117624, "frac_reward_zero_std": 1.0, "grad_norm": 0.02438320938851383, "kl": 0.08099365234375, "learning_rate": 7.343750000000001e-06, "loss": 0.0008, "num_tokens": 10951081.0, "reward": 1.125, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 95 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 634.0, "completions/max_terminated_length": 634.0, "completions/mean_length": 341.70703125, "completions/mean_terminated_length": 341.70703125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.0375605223260136, "frac_reward_zero_std": 0.875, "grad_norm": 0.13431298624624705, "kl": 0.0740966796875, "learning_rate": 7.421875000000001e-06, "loss": -0.0049, "num_tokens": 11067710.0, "reward": 1.12060546875, "reward_std": 0.017578125, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 96 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 288.41796875, "completions/mean_terminated_length": 288.41796875, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.03795177776690957, "frac_reward_zero_std": 0.625, "grad_norm": 0.2963321432649285, "kl": 0.09893798828125, "learning_rate": 7.500000000000001e-06, "loss": 0.0149, "num_tokens": 11170089.0, "reward": 1.2288818359375, "reward_std": 0.07307660579681396, "rewards/accuracy_reward/mean": 0.98046875, "rewards/accuracy_reward/std": 0.13865381479263306, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 97 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 651.0, "completions/max_terminated_length": 651.0, "completions/mean_length": 291.03515625, "completions/mean_terminated_length": 291.03515625, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.03834303320780555, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2222055633171999, "kl": 0.10791015625, "learning_rate": 7.578125e-06, "loss": -0.0051, "num_tokens": 11273314.0, "reward": 1.0029296875, "reward_std": 0.057051654905080795, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0625, "step": 98 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 865.0, "completions/max_terminated_length": 865.0, "completions/mean_length": 323.3046875, "completions/mean_terminated_length": 323.3046875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.03873428864870152, "frac_reward_zero_std": 0.6875, "grad_norm": 0.310322050429281, "kl": 0.09112548828125, "learning_rate": 7.656250000000001e-06, "loss": 0.0332, "num_tokens": 11386096.0, "reward": 1.198486328125, "reward_std": 0.078780397772789, "rewards/accuracy_reward/mean": 0.94921875, "rewards/accuracy_reward/std": 0.21998079121112823, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 99 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 797.0, "completions/max_terminated_length": 797.0, "completions/mean_length": 316.65234375, "completions/mean_terminated_length": 316.65234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.039125544089597494, "frac_reward_zero_std": 0.8125, "grad_norm": 0.21436669478092463, "kl": 0.09307861328125, "learning_rate": 7.734375e-06, "loss": 0.0072, "num_tokens": 11495095.0, "reward": 1.1170654296875, "reward_std": 0.03173828125, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 829.0, "completions/max_terminated_length": 829.0, "completions/mean_length": 378.62890625, "completions/mean_terminated_length": 378.62890625, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.039516799530493474, "frac_reward_zero_std": 0.875, "grad_norm": 0.14026979970815118, "kl": 0.07415771484375, "learning_rate": 7.8125e-06, "loss": 0.0027, "num_tokens": 11622856.0, "reward": 1.07421875, "reward_std": 0.051267411559820175, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 798.0, "completions/max_terminated_length": 798.0, "completions/mean_length": 363.67578125, "completions/mean_terminated_length": 363.67578125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.039908054971389446, "frac_reward_zero_std": 0.75, "grad_norm": 0.21945312565464492, "kl": 0.077392578125, "learning_rate": 7.890625e-06, "loss": 0.0075, "num_tokens": 11745653.0, "reward": 1.07421875, "reward_std": 0.09122256934642792, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 914.0, "completions/max_terminated_length": 914.0, "completions/mean_length": 375.1953125, "completions/mean_terminated_length": 375.1953125, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.04029931041228542, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1876202321314651, "kl": 0.08660888671875, "learning_rate": 7.96875e-06, "loss": 0.0038, "num_tokens": 11870551.0, "reward": 1.17578125, "reward_std": 0.046875, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 877.0, "completions/max_terminated_length": 877.0, "completions/mean_length": 391.94921875, "completions/mean_terminated_length": 391.94921875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.0406905658531814, "frac_reward_zero_std": 0.875, "grad_norm": 0.12739335428788742, "kl": 0.0723876953125, "learning_rate": 8.046875e-06, "loss": -0.0028, "num_tokens": 12001370.0, "reward": 1.15234375, "reward_std": 0.04789986088871956, "rewards/accuracy_reward/mean": 0.90234375, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 887.0, "completions/max_terminated_length": 887.0, "completions/mean_length": 416.328125, "completions/mean_terminated_length": 416.328125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.04108182129407737, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2373941491234095, "kl": 0.06781005859375, "learning_rate": 8.125000000000001e-06, "loss": -0.0013, "num_tokens": 12139342.0, "reward": 1.206298828125, "reward_std": 0.09501665830612183, "rewards/accuracy_reward/mean": 0.95703125, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 820.0, "completions/max_terminated_length": 820.0, "completions/mean_length": 364.21484375, "completions/mean_terminated_length": 364.21484375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.041473076734973344, "frac_reward_zero_std": 0.75, "grad_norm": 0.2583586858275147, "kl": 0.08489990234375, "learning_rate": 8.203125000000001e-06, "loss": 0.0207, "num_tokens": 12263445.0, "reward": 1.234375, "reward_std": 0.0625, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 443.03515625, "completions/mean_terminated_length": 443.03515625, "completions/min_length": 192.0, "completions/min_terminated_length": 192.0, "epoch": 0.04186433217586932, "frac_reward_zero_std": 0.6875, "grad_norm": 0.23286374653111874, "kl": 0.0689697265625, "learning_rate": 8.281250000000001e-06, "loss": 0.0094, "num_tokens": 12407134.0, "reward": 1.0384521484375, "reward_std": 0.07451096177101135, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 924.0, "completions/max_terminated_length": 924.0, "completions/mean_length": 368.33984375, "completions/mean_terminated_length": 368.33984375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.042255587616765296, "frac_reward_zero_std": 0.875, "grad_norm": 0.16604300947951203, "kl": 0.08551025390625, "learning_rate": 8.359375e-06, "loss": -0.0053, "num_tokens": 12530709.0, "reward": 1.119873046875, "reward_std": 0.01896059513092041, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 413.74609375, "completions/mean_terminated_length": 413.74609375, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.04264684305766127, "frac_reward_zero_std": 0.625, "grad_norm": 0.28231039284761755, "kl": 0.0753173828125, "learning_rate": 8.4375e-06, "loss": 0.0067, "num_tokens": 12665348.0, "reward": 1.21630859375, "reward_std": 0.07071840763092041, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 817.0, "completions/max_terminated_length": 817.0, "completions/mean_length": 387.64453125, "completions/mean_terminated_length": 387.64453125, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.04303809849855725, "frac_reward_zero_std": 0.875, "grad_norm": 0.14954400783640737, "kl": 0.083984375, "learning_rate": 8.515625e-06, "loss": 0.0001, "num_tokens": 12794713.0, "reward": 1.17578125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 856.0, "completions/max_terminated_length": 856.0, "completions/mean_length": 437.0859375, "completions/mean_terminated_length": 437.0859375, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.04342935393945322, "frac_reward_zero_std": 0.8125, "grad_norm": 0.1719568210158983, "kl": 0.0826416015625, "learning_rate": 8.59375e-06, "loss": 0.0032, "num_tokens": 12939215.0, "reward": 1.1484375, "reward_std": 0.07833996415138245, "rewards/accuracy_reward/mean": 0.8984375, "rewards/accuracy_reward/std": 0.3026638329029083, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 442.30078125, "completions/mean_terminated_length": 442.30078125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.04382060938034919, "frac_reward_zero_std": 0.6875, "grad_norm": 0.22780031785879812, "kl": 0.088134765625, "learning_rate": 8.671875e-06, "loss": 0.0002, "num_tokens": 13081052.0, "reward": 1.10546875, "reward_std": 0.078125, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 898.0, "completions/max_terminated_length": 898.0, "completions/mean_length": 432.44921875, "completions/mean_terminated_length": 432.44921875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.04421186482124517, "frac_reward_zero_std": 0.875, "grad_norm": 0.13728010345327393, "kl": 0.08721923828125, "learning_rate": 8.750000000000001e-06, "loss": 0.0028, "num_tokens": 13220431.0, "reward": 1.02734375, "reward_std": 0.04789986088871956, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1483.0, "completions/max_terminated_length": 1483.0, "completions/mean_length": 475.078125, "completions/mean_terminated_length": 475.078125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.044603120262141145, "frac_reward_zero_std": 0.625, "grad_norm": 0.21782373139768163, "kl": 0.081298828125, "learning_rate": 8.828125000000001e-06, "loss": 0.0032, "num_tokens": 13373731.0, "reward": 1.05859375, "reward_std": 0.12679657340049744, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 757.0, "completions/max_terminated_length": 757.0, "completions/mean_length": 392.1484375, "completions/mean_terminated_length": 392.1484375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.04499437570303712, "frac_reward_zero_std": 0.9375, "grad_norm": 0.10759084919723048, "kl": 0.10736083984375, "learning_rate": 8.906250000000001e-06, "loss": 0.0028, "num_tokens": 13504425.0, "reward": 1.18359375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.93359375, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 733.0, "completions/max_terminated_length": 733.0, "completions/mean_length": 406.8046875, "completions/mean_terminated_length": 406.8046875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.0453856311439331, "frac_reward_zero_std": 0.875, "grad_norm": 0.19412580626685516, "kl": 0.10296630859375, "learning_rate": 8.984375000000002e-06, "loss": 0.0072, "num_tokens": 13637687.0, "reward": 1.17578125, "reward_std": 0.03697281330823898, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 406.4765625, "completions/mean_terminated_length": 406.4765625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.04577688658482907, "frac_reward_zero_std": 0.75, "grad_norm": 0.24962660268347928, "kl": 0.11083984375, "learning_rate": 9.0625e-06, "loss": 0.0142, "num_tokens": 13771569.0, "reward": 1.17578125, "reward_std": 0.08736192435026169, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 993.0, "completions/mean_length": 409.20703125, "completions/mean_terminated_length": 402.7804260253906, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.04616814202572504, "frac_reward_zero_std": 0.625, "grad_norm": 0.269571996434127, "kl": 0.11517333984375, "learning_rate": 9.140625e-06, "loss": 0.0546, "num_tokens": 13905014.0, "reward": 1.1439208984375, "reward_std": 0.11115114390850067, "rewards/accuracy_reward/mean": 0.89453125, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 890.0, "completions/mean_length": 378.35546875, "completions/mean_terminated_length": 371.807861328125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.04655939746662102, "frac_reward_zero_std": 0.75, "grad_norm": 0.39223980601759006, "kl": 0.14520263671875, "learning_rate": 9.21875e-06, "loss": 0.0636, "num_tokens": 14031825.0, "reward": 1.1180419921875, "reward_std": 0.02444446086883545, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 447.2578125, "completions/mean_terminated_length": 447.2578125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.046950652907516995, "frac_reward_zero_std": 0.5625, "grad_norm": 0.32664991571353663, "kl": 0.1070556640625, "learning_rate": 9.296875e-06, "loss": 0.0133, "num_tokens": 14176723.0, "reward": 1.1728515625, "reward_std": 0.05704653263092041, "rewards/accuracy_reward/mean": 0.92578125, "rewards/accuracy_reward/std": 0.2626400291919708, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1171.0, "completions/max_terminated_length": 1171.0, "completions/mean_length": 373.89453125, "completions/mean_terminated_length": 373.89453125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.04734190834841297, "frac_reward_zero_std": 0.8125, "grad_norm": 0.18936481392620383, "kl": 0.1041259765625, "learning_rate": 9.375000000000001e-06, "loss": -0.001, "num_tokens": 14300744.0, "reward": 1.10546875, "reward_std": 0.05644455552101135, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1089.0, "completions/max_terminated_length": 1089.0, "completions/mean_length": 436.10546875, "completions/mean_terminated_length": 436.10546875, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.04773316378930895, "frac_reward_zero_std": 0.5, "grad_norm": 0.28938181501993904, "kl": 0.08978271484375, "learning_rate": 9.453125000000001e-06, "loss": 0.0165, "num_tokens": 14444419.0, "reward": 1.1829833984375, "reward_std": 0.143758624792099, "rewards/accuracy_reward/mean": 0.93359375, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 364.70703125, "completions/mean_terminated_length": 364.70703125, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.04812441923020492, "frac_reward_zero_std": 0.625, "grad_norm": 0.308057208003193, "kl": 0.1265869140625, "learning_rate": 9.531250000000001e-06, "loss": 0.0089, "num_tokens": 14567064.0, "reward": 1.178466796875, "reward_std": 0.1251467913389206, "rewards/accuracy_reward/mean": 0.9296875, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 773.0, "completions/max_terminated_length": 773.0, "completions/mean_length": 382.51953125, "completions/mean_terminated_length": 382.51953125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.04851567467110089, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2490682484953628, "kl": 0.1060791015625, "learning_rate": 9.609375000000001e-06, "loss": 0.007, "num_tokens": 14693277.0, "reward": 1.2103271484375, "reward_std": 0.09817036241292953, "rewards/accuracy_reward/mean": 0.9609375, "rewards/accuracy_reward/std": 0.19412322342395782, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1041.0, "completions/max_terminated_length": 1041.0, "completions/mean_length": 358.21484375, "completions/mean_terminated_length": 358.21484375, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.04890693011199687, "frac_reward_zero_std": 0.625, "grad_norm": 0.27386179186366993, "kl": 0.112548828125, "learning_rate": 9.6875e-06, "loss": 0.0147, "num_tokens": 14814116.0, "reward": 1.1907958984375, "reward_std": 0.11577065289020538, "rewards/accuracy_reward/mean": 0.94140625, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 937.0, "completions/max_terminated_length": 937.0, "completions/mean_length": 430.42578125, "completions/mean_terminated_length": 430.42578125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.049298185552892844, "frac_reward_zero_std": 0.75, "grad_norm": 0.19554460203812724, "kl": 0.09820556640625, "learning_rate": 9.765625e-06, "loss": 0.0143, "num_tokens": 14956097.0, "reward": 1.15625, "reward_std": 0.08351518213748932, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 431.7890625, "completions/mean_terminated_length": 431.7890625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.049689440993788817, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2527266519131744, "kl": 0.09918212890625, "learning_rate": 9.84375e-06, "loss": -0.0056, "num_tokens": 15097355.0, "reward": 1.0997314453125, "reward_std": 0.07939377427101135, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 850.0, "completions/max_terminated_length": 850.0, "completions/mean_length": 371.71875, "completions/mean_terminated_length": 371.71875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.050080696434684796, "frac_reward_zero_std": 0.875, "grad_norm": 0.209193946151249, "kl": 0.11175537109375, "learning_rate": 9.921875e-06, "loss": 0.0148, "num_tokens": 15221987.0, "reward": 1.2454833984375, "reward_std": 0.01806640625, "rewards/accuracy_reward/mean": 0.99609375, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 756.0, "completions/max_terminated_length": 756.0, "completions/mean_length": 374.33203125, "completions/mean_terminated_length": 374.33203125, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.05047195187558077, "frac_reward_zero_std": 0.8125, "grad_norm": 0.17329290900899288, "kl": 0.10382080078125, "learning_rate": 1e-05, "loss": 0.0089, "num_tokens": 15345160.0, "reward": 1.0423583984375, "reward_std": 0.04601725563406944, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 962.0, "completions/max_terminated_length": 962.0, "completions/mean_length": 377.3046875, "completions/mean_terminated_length": 377.3046875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.05086320731647675, "frac_reward_zero_std": 0.8125, "grad_norm": 0.25512014511187614, "kl": 0.1026611328125, "learning_rate": 1.0078125000000001e-05, "loss": 0.0187, "num_tokens": 15470534.0, "reward": 1.167236328125, "reward_std": 0.05937424302101135, "rewards/accuracy_reward/mean": 0.91796875, "rewards/accuracy_reward/std": 0.2749498784542084, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1067.0, "completions/mean_length": 412.5, "completions/mean_terminated_length": 406.0863037109375, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.05125446275737272, "frac_reward_zero_std": 0.75, "grad_norm": 0.29847980631081095, "kl": 0.0921630859375, "learning_rate": 1.0156250000000001e-05, "loss": 0.0526, "num_tokens": 15605958.0, "reward": 1.2381591796875, "reward_std": 0.04736328125, "rewards/accuracy_reward/mean": 0.98828125, "rewards/accuracy_reward/std": 0.1078278198838234, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 434.5625, "completions/mean_terminated_length": 434.5625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.05164571819826869, "frac_reward_zero_std": 0.875, "grad_norm": 0.17985569557042155, "kl": 0.0902099609375, "learning_rate": 1.0234375000000001e-05, "loss": 0.0158, "num_tokens": 15746966.0, "reward": 1.2457275390625, "reward_std": 0.01708984375, "rewards/accuracy_reward/mean": 0.99609375, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 515.015625, "completions/mean_terminated_length": 515.015625, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.05203697363916467, "frac_reward_zero_std": 0.625, "grad_norm": 0.2515211416419203, "kl": 0.07958984375, "learning_rate": 1.0312500000000002e-05, "loss": 0.0281, "num_tokens": 15910170.0, "reward": 1.049560546875, "reward_std": 0.11293761432170868, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 399.21484375, "completions/mean_terminated_length": 399.21484375, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.052428229080060645, "frac_reward_zero_std": 0.6875, "grad_norm": 0.32859600910754616, "kl": 0.0965576171875, "learning_rate": 1.0390625e-05, "loss": 0.0007, "num_tokens": 16043617.0, "reward": 1.05078125, "reward_std": 0.12184548377990723, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 893.0, "completions/max_terminated_length": 893.0, "completions/mean_length": 378.2734375, "completions/mean_terminated_length": 378.2734375, "completions/min_length": 120.0, "completions/min_terminated_length": 120.0, "epoch": 0.05281948452095662, "frac_reward_zero_std": 0.75, "grad_norm": 0.24192653807344616, "kl": 0.09002685546875, "learning_rate": 1.046875e-05, "loss": 0.0076, "num_tokens": 16170743.0, "reward": 1.15234375, "reward_std": 0.0892379954457283, "rewards/accuracy_reward/mean": 0.90234375, "rewards/accuracy_reward/std": 0.29743078351020813, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1227.0, "completions/max_terminated_length": 1227.0, "completions/mean_length": 379.84375, "completions/mean_terminated_length": 379.84375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.0532107399618526, "frac_reward_zero_std": 0.8125, "grad_norm": 0.25798674991822784, "kl": 0.09759521484375, "learning_rate": 1.0546875e-05, "loss": 0.014, "num_tokens": 16296783.0, "reward": 1.16015625, "reward_std": 0.06492365896701813, "rewards/accuracy_reward/mean": 0.91015625, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 675.0, "completions/max_terminated_length": 675.0, "completions/mean_length": 376.31640625, "completions/mean_terminated_length": 376.31640625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.05360199540274857, "frac_reward_zero_std": 0.5625, "grad_norm": 0.28177493298587275, "kl": 0.09490966796875, "learning_rate": 1.0625e-05, "loss": 0.0082, "num_tokens": 16425392.0, "reward": 1.1126708984375, "reward_std": 0.12656591832637787, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 915.0, "completions/max_terminated_length": 915.0, "completions/mean_length": 370.32421875, "completions/mean_terminated_length": 370.32421875, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.05399325084364454, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3164078656639367, "kl": 0.09527587890625, "learning_rate": 1.0703125000000001e-05, "loss": 0.0198, "num_tokens": 16548035.0, "reward": 1.1595458984375, "reward_std": 0.10191421955823898, "rewards/accuracy_reward/mean": 0.91015625, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1271.0, "completions/max_terminated_length": 1271.0, "completions/mean_length": 357.6953125, "completions/mean_terminated_length": 357.6953125, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.05438450628454052, "frac_reward_zero_std": 0.6875, "grad_norm": 0.26971077264757604, "kl": 0.0927734375, "learning_rate": 1.0781250000000001e-05, "loss": 0.0259, "num_tokens": 16668037.0, "reward": 1.05810546875, "reward_std": 0.10949081182479858, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 329.640625, "completions/mean_terminated_length": 329.640625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.054775761725436495, "frac_reward_zero_std": 0.75, "grad_norm": 0.22100374565564945, "kl": 0.10009765625, "learning_rate": 1.0859375000000001e-05, "loss": 0.0015, "num_tokens": 16782409.0, "reward": 1.21875, "reward_std": 0.0767945945262909, "rewards/accuracy_reward/mean": 0.96875, "rewards/accuracy_reward/std": 0.17433346807956696, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 686.0, "completions/max_terminated_length": 686.0, "completions/mean_length": 322.34375, "completions/mean_terminated_length": 322.34375, "completions/min_length": 124.0, "completions/min_terminated_length": 124.0, "epoch": 0.05516701716633247, "frac_reward_zero_std": 0.9375, "grad_norm": 0.11171619915668742, "kl": 0.1015625, "learning_rate": 1.0937500000000002e-05, "loss": 0.0026, "num_tokens": 16894897.0, "reward": 1.18359375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.93359375, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1129.0, "completions/max_terminated_length": 1129.0, "completions/mean_length": 384.87109375, "completions/mean_terminated_length": 384.87109375, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.05555827260722845, "frac_reward_zero_std": 0.75, "grad_norm": 0.2335279663644235, "kl": 0.09820556640625, "learning_rate": 1.1015625e-05, "loss": 0.0286, "num_tokens": 17025008.0, "reward": 1.0703125, "reward_std": 0.091475710272789, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 922.0, "completions/max_terminated_length": 922.0, "completions/mean_length": 291.125, "completions/mean_terminated_length": 291.125, "completions/min_length": 105.0, "completions/min_terminated_length": 105.0, "epoch": 0.05594952804812442, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2627971666326669, "kl": 0.1260986328125, "learning_rate": 1.109375e-05, "loss": 0.0045, "num_tokens": 17128784.0, "reward": 1.0390625, "reward_std": 0.05920084938406944, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 252.83203125, "completions/mean_terminated_length": 252.83203125, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.05634078348902039, "frac_reward_zero_std": 0.875, "grad_norm": 0.23031632649164296, "kl": 0.13385009765625, "learning_rate": 1.1171875e-05, "loss": 0.0192, "num_tokens": 17222165.0, "reward": 1.2421875, "reward_std": 0.03125, "rewards/accuracy_reward/mean": 0.9921875, "rewards/accuracy_reward/std": 0.08821486681699753, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1152.0, "completions/max_terminated_length": 1152.0, "completions/mean_length": 269.3046875, "completions/mean_terminated_length": 269.3046875, "completions/min_length": 67.0, "completions/min_terminated_length": 67.0, "epoch": 0.05673203892991637, "frac_reward_zero_std": 0.75, "grad_norm": 0.29950565362698334, "kl": 0.1365966796875, "learning_rate": 1.125e-05, "loss": 0.0372, "num_tokens": 17320755.0, "reward": 1.21484375, "reward_std": 0.0892379954457283, "rewards/accuracy_reward/mean": 0.96484375, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 889.0, "completions/max_terminated_length": 889.0, "completions/mean_length": 277.57421875, "completions/mean_terminated_length": 277.57421875, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.057123294370812344, "frac_reward_zero_std": 0.75, "grad_norm": 0.34639809769491986, "kl": 0.12908935546875, "learning_rate": 1.1328125000000001e-05, "loss": 0.0106, "num_tokens": 17420902.0, "reward": 1.19140625, "reward_std": 0.07482585310935974, "rewards/accuracy_reward/mean": 0.94140625, "rewards/accuracy_reward/std": 0.23532284796237946, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 672.0, "completions/max_terminated_length": 672.0, "completions/mean_length": 296.71875, "completions/mean_terminated_length": 296.71875, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.05751454981170832, "frac_reward_zero_std": 0.75, "grad_norm": 0.31116668964821936, "kl": 0.119140625, "learning_rate": 1.1406250000000001e-05, "loss": 0.0192, "num_tokens": 17528398.0, "reward": 1.03515625, "reward_std": 0.07779236882925034, "rewards/accuracy_reward/mean": 0.78515625, "rewards/accuracy_reward/std": 0.4115184545516968, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 700.0, "completions/max_terminated_length": 700.0, "completions/mean_length": 266.234375, "completions/mean_terminated_length": 266.234375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.057905805252604296, "frac_reward_zero_std": 0.9375, "grad_norm": 0.15709844797651784, "kl": 0.1424560546875, "learning_rate": 1.1484375000000001e-05, "loss": 0.004, "num_tokens": 17625978.0, "reward": 1.18359375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.93359375, "rewards/accuracy_reward/std": 0.24947863817214966, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 616.0, "completions/max_terminated_length": 616.0, "completions/mean_length": 260.2734375, "completions/mean_terminated_length": 260.2734375, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.05829706069350027, "frac_reward_zero_std": 0.9375, "grad_norm": 0.13597427848409235, "kl": 0.14111328125, "learning_rate": 1.1562500000000002e-05, "loss": -0.0013, "num_tokens": 17722608.0, "reward": 1.12109375, "reward_std": 0.015625, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 208.703125, "completions/mean_terminated_length": 208.703125, "completions/min_length": 68.0, "completions/min_terminated_length": 68.0, "epoch": 0.05868831613439624, "frac_reward_zero_std": 0.75, "grad_norm": 0.40395651722247866, "kl": 0.1866455078125, "learning_rate": 1.1640625000000002e-05, "loss": 0.0041, "num_tokens": 17804788.0, "reward": 1.14453125, "reward_std": 0.09287451207637787, "rewards/accuracy_reward/mean": 0.89453125, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 752.0, "completions/max_terminated_length": 752.0, "completions/mean_length": 275.18359375, "completions/mean_terminated_length": 275.18359375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.05907957157529222, "frac_reward_zero_std": 0.8125, "grad_norm": 0.18838121405472785, "kl": 0.14404296875, "learning_rate": 1.171875e-05, "loss": -0.0004, "num_tokens": 17906931.0, "reward": 1.20703125, "reward_std": 0.07482585310935974, "rewards/accuracy_reward/mean": 0.95703125, "rewards/accuracy_reward/std": 0.20318391919136047, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 776.0, "completions/max_terminated_length": 776.0, "completions/mean_length": 311.7578125, "completions/mean_terminated_length": 311.7578125, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.059470827016188194, "frac_reward_zero_std": 0.5, "grad_norm": 0.3311453652330293, "kl": 0.13812255859375, "learning_rate": 1.1796875e-05, "loss": 0.0107, "num_tokens": 18018997.0, "reward": 1.0267333984375, "reward_std": 0.15702837705612183, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 298.82421875, "completions/mean_terminated_length": 285.0511779785156, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.059862082457084166, "frac_reward_zero_std": 0.4375, "grad_norm": 0.5900840451709776, "kl": 0.1798095703125, "learning_rate": 1.1875e-05, "loss": 0.0718, "num_tokens": 18122840.0, "reward": 1.0216064453125, "reward_std": 0.17017348110675812, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1030.0, "completions/max_terminated_length": 1030.0, "completions/mean_length": 322.08984375, "completions/mean_terminated_length": 322.08984375, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.060253337897980146, "frac_reward_zero_std": 0.625, "grad_norm": 0.40699074816735903, "kl": 0.2620849609375, "learning_rate": 1.1953125000000001e-05, "loss": 0.0054, "num_tokens": 18236415.0, "reward": 1.0625, "reward_std": 0.15461406111717224, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 335.0703125, "completions/mean_terminated_length": 335.0703125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.06064459333887612, "frac_reward_zero_std": 0.9375, "grad_norm": 0.13957010305610074, "kl": 0.2196044921875, "learning_rate": 1.2031250000000001e-05, "loss": 0.0167, "num_tokens": 18352785.0, "reward": 1.1796875, "reward_std": 0.021347813308238983, "rewards/accuracy_reward/mean": 0.9296875, "rewards/accuracy_reward/std": 0.2561737895011902, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1462.0, "completions/max_terminated_length": 1462.0, "completions/mean_length": 390.68359375, "completions/mean_terminated_length": 390.68359375, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.06103584877977209, "frac_reward_zero_std": 0.625, "grad_norm": 0.3724471075915449, "kl": 0.2049560546875, "learning_rate": 1.2109375000000001e-05, "loss": 0.0273, "num_tokens": 18484752.0, "reward": 1.04296875, "reward_std": 0.11091843992471695, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1082.0, "completions/max_terminated_length": 1082.0, "completions/mean_length": 397.72265625, "completions/mean_terminated_length": 397.72265625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.06142710422066807, "frac_reward_zero_std": 0.375, "grad_norm": 0.4306138940190092, "kl": 0.23388671875, "learning_rate": 1.2187500000000001e-05, "loss": 0.0327, "num_tokens": 18614297.0, "reward": 0.947509765625, "reward_std": 0.1394595056772232, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0539139099419117, "step": 157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 425.1171875, "completions/mean_terminated_length": 425.1171875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.06181835966156404, "frac_reward_zero_std": 0.5625, "grad_norm": 0.35838255057660584, "kl": 0.2088623046875, "learning_rate": 1.2265625000000002e-05, "loss": 0.0087, "num_tokens": 18751543.0, "reward": 1.05859375, "reward_std": 0.14547231793403625, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1485.0, "completions/mean_length": 530.74609375, "completions/mean_terminated_length": 524.7960815429688, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.062209615102460016, "frac_reward_zero_std": 0.625, "grad_norm": 0.2502134252301436, "kl": 0.1990966796875, "learning_rate": 1.234375e-05, "loss": 0.0329, "num_tokens": 18918278.0, "reward": 1.0814208984375, "reward_std": 0.15322360396385193, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1746.0, "completions/max_terminated_length": 1746.0, "completions/mean_length": 500.76953125, "completions/mean_terminated_length": 500.76953125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.06260087054335599, "frac_reward_zero_std": 0.6875, "grad_norm": 0.25379918710439947, "kl": 0.18359375, "learning_rate": 1.2421875e-05, "loss": 0.044, "num_tokens": 19076075.0, "reward": 0.8984375, "reward_std": 0.12654343247413635, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1196.0, "completions/max_terminated_length": 1196.0, "completions/mean_length": 508.58203125, "completions/mean_terminated_length": 508.58203125, "completions/min_length": 222.0, "completions/min_terminated_length": 222.0, "epoch": 0.06299212598425197, "frac_reward_zero_std": 0.5625, "grad_norm": 0.25958313027536783, "kl": 0.176513671875, "learning_rate": 1.25e-05, "loss": 0.0143, "num_tokens": 19234768.0, "reward": 0.921875, "reward_std": 0.2017040103673935, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1420.0, "completions/mean_length": 474.59765625, "completions/mean_terminated_length": 468.427490234375, "completions/min_length": 166.0, "completions/min_terminated_length": 166.0, "epoch": 0.06338338142514795, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2847872535110348, "kl": 0.2000732421875, "learning_rate": 1.2578125e-05, "loss": 0.0588, "num_tokens": 19385065.0, "reward": 1.0772705078125, "reward_std": 0.15573075413703918, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 383.4296875, "completions/mean_terminated_length": 383.4296875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.06377463686604391, "frac_reward_zero_std": 0.75, "grad_norm": 0.21608229113735827, "kl": 0.2005615234375, "learning_rate": 1.2656250000000001e-05, "loss": 0.018, "num_tokens": 19511655.0, "reward": 1.07421875, "reward_std": 0.10671419650316238, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1421.0, "completions/max_terminated_length": 1421.0, "completions/mean_length": 483.91796875, "completions/mean_terminated_length": 483.91796875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.06416589230693989, "frac_reward_zero_std": 0.625, "grad_norm": 0.2267780736464419, "kl": 0.166748046875, "learning_rate": 1.2734375000000001e-05, "loss": 0.0069, "num_tokens": 19665730.0, "reward": 1.1556396484375, "reward_std": 0.12403374910354614, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 401.44921875, "completions/mean_terminated_length": 401.44921875, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.06455714774783587, "frac_reward_zero_std": 0.75, "grad_norm": 0.27636969593179767, "kl": 0.1873779296875, "learning_rate": 1.2812500000000001e-05, "loss": 0.0273, "num_tokens": 19799685.0, "reward": 1.10546875, "reward_std": 0.10049767792224884, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 881.0, "completions/max_terminated_length": 881.0, "completions/mean_length": 350.546875, "completions/mean_terminated_length": 350.546875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.06494840318873184, "frac_reward_zero_std": 0.3125, "grad_norm": 0.38321897490510604, "kl": 0.1883544921875, "learning_rate": 1.2890625000000002e-05, "loss": 0.0151, "num_tokens": 19919121.0, "reward": 0.96875, "reward_std": 0.299363374710083, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 376.05859375, "completions/mean_terminated_length": 376.05859375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.06533965862962782, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3762722552482031, "kl": 0.1669921875, "learning_rate": 1.2968750000000002e-05, "loss": 0.024, "num_tokens": 20045136.0, "reward": 1.046875, "reward_std": 0.23505613207817078, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 326.546875, "completions/mean_terminated_length": 326.546875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.0657309140705238, "frac_reward_zero_std": 0.625, "grad_norm": 0.326038562893061, "kl": 0.1727294921875, "learning_rate": 1.3046875e-05, "loss": 0.0463, "num_tokens": 20158460.0, "reward": 1.1165771484375, "reward_std": 0.13403882086277008, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 325.0859375, "completions/mean_terminated_length": 325.0859375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06612216951141976, "frac_reward_zero_std": 0.4375, "grad_norm": 1.1077535993318635, "kl": 0.17431640625, "learning_rate": 1.3125e-05, "loss": 0.0348, "num_tokens": 20269650.0, "reward": 1.189697265625, "reward_std": 0.11025739461183548, "rewards/accuracy_reward/mean": 0.9453125, "rewards/accuracy_reward/std": 0.22781464457511902, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.048530805855989456, "step": 169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 939.0, "completions/max_terminated_length": 939.0, "completions/mean_length": 376.66015625, "completions/mean_terminated_length": 376.66015625, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.06651342495231574, "frac_reward_zero_std": 0.625, "grad_norm": 0.3168750225720623, "kl": 0.1527099609375, "learning_rate": 1.3203125e-05, "loss": 0.0278, "num_tokens": 20396715.0, "reward": 1.0450439453125, "reward_std": 0.14939983189105988, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 879.0, "completions/max_terminated_length": 879.0, "completions/mean_length": 337.3203125, "completions/mean_terminated_length": 337.3203125, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.06690468039321172, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2760168919764172, "kl": 0.1673583984375, "learning_rate": 1.3281250000000001e-05, "loss": 0.0182, "num_tokens": 20510253.0, "reward": 1.21484375, "reward_std": 0.07073915004730225, "rewards/accuracy_reward/mean": 0.96484375, "rewards/accuracy_reward/std": 0.18453538417816162, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 795.0, "completions/max_terminated_length": 795.0, "completions/mean_length": 333.96875, "completions/mean_terminated_length": 333.96875, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.06729593583410769, "frac_reward_zero_std": 0.6875, "grad_norm": 0.29604580527493574, "kl": 0.1802978515625, "learning_rate": 1.3359375000000001e-05, "loss": 0.0196, "num_tokens": 20623253.0, "reward": 1.078125, "reward_std": 0.10024453699588776, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 356.171875, "completions/mean_terminated_length": 356.171875, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.06768719127500367, "frac_reward_zero_std": 0.625, "grad_norm": 0.2852234809306463, "kl": 0.1517333984375, "learning_rate": 1.3437500000000001e-05, "loss": 0.0119, "num_tokens": 20744017.0, "reward": 1.12890625, "reward_std": 0.14656084775924683, "rewards/accuracy_reward/mean": 0.87890625, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 681.0, "completions/max_terminated_length": 681.0, "completions/mean_length": 344.20703125, "completions/mean_terminated_length": 344.20703125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.06807844671589965, "frac_reward_zero_std": 0.8125, "grad_norm": 0.18533700815479398, "kl": 0.153564453125, "learning_rate": 1.3515625000000002e-05, "loss": 0.0021, "num_tokens": 20861830.0, "reward": 1.006591796875, "reward_std": 0.026230625808238983, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1842.0, "completions/max_terminated_length": 1842.0, "completions/mean_length": 404.6875, "completions/mean_terminated_length": 404.6875, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.06846970215679561, "frac_reward_zero_std": 0.5, "grad_norm": 0.27974264671426247, "kl": 0.1614990234375, "learning_rate": 1.3593750000000002e-05, "loss": -0.0166, "num_tokens": 20996038.0, "reward": 1.0625, "reward_std": 0.17493700981140137, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 772.0, "completions/max_terminated_length": 772.0, "completions/mean_length": 327.48046875, "completions/mean_terminated_length": 327.48046875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.06886095759769159, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3201481025634699, "kl": 0.207275390625, "learning_rate": 1.3671875e-05, "loss": -0.0066, "num_tokens": 21107873.0, "reward": 1.125, "reward_std": 0.14634782075881958, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 613.0, "completions/max_terminated_length": 613.0, "completions/mean_length": 328.34765625, "completions/mean_terminated_length": 328.34765625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.06925221303858757, "frac_reward_zero_std": 0.625, "grad_norm": 0.27157736796838844, "kl": 0.17333984375, "learning_rate": 1.375e-05, "loss": 0.0125, "num_tokens": 21218842.0, "reward": 1.0546875, "reward_std": 0.13599544763565063, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 667.0, "completions/max_terminated_length": 667.0, "completions/mean_length": 329.1015625, "completions/mean_terminated_length": 329.1015625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.06964346847948354, "frac_reward_zero_std": 0.8125, "grad_norm": 0.19234272224618842, "kl": 0.1864013671875, "learning_rate": 1.3828125e-05, "loss": -0.0003, "num_tokens": 21333124.0, "reward": 1.2337646484375, "reward_std": 0.04326096177101135, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 910.0, "completions/max_terminated_length": 910.0, "completions/mean_length": 382.4765625, "completions/mean_terminated_length": 382.4765625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.07003472392037952, "frac_reward_zero_std": 0.6875, "grad_norm": 0.2591850395265174, "kl": 0.1627197265625, "learning_rate": 1.3906250000000001e-05, "loss": 0.0087, "num_tokens": 21459998.0, "reward": 1.14453125, "reward_std": 0.1076192855834961, "rewards/accuracy_reward/mean": 0.89453125, "rewards/accuracy_reward/std": 0.3077581524848938, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1004.0, "completions/max_terminated_length": 1004.0, "completions/mean_length": 379.15234375, "completions/mean_terminated_length": 379.15234375, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.0704259793612755, "frac_reward_zero_std": 0.75, "grad_norm": 0.22341202068552637, "kl": 0.1845703125, "learning_rate": 1.3984375000000001e-05, "loss": 0.0044, "num_tokens": 21585509.0, "reward": 1.046875, "reward_std": 0.11083894968032837, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 382.36328125, "completions/mean_terminated_length": 382.36328125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.07081723480217146, "frac_reward_zero_std": 0.4375, "grad_norm": 0.33540252261854425, "kl": 0.177001953125, "learning_rate": 1.4062500000000001e-05, "loss": 0.0068, "num_tokens": 21713570.0, "reward": 1.125, "reward_std": 0.1978301852941513, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 896.0, "completions/max_terminated_length": 896.0, "completions/mean_length": 402.48828125, "completions/mean_terminated_length": 402.48828125, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.07120849024306744, "frac_reward_zero_std": 0.375, "grad_norm": 0.4417539833421105, "kl": 0.18701171875, "learning_rate": 1.4140625000000002e-05, "loss": 0.0341, "num_tokens": 21849135.0, "reward": 1.1171875, "reward_std": 0.18409234285354614, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 815.0, "completions/max_terminated_length": 815.0, "completions/mean_length": 422.78515625, "completions/mean_terminated_length": 422.78515625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.07159974568396342, "frac_reward_zero_std": 0.5625, "grad_norm": 0.321676116743114, "kl": 0.16748046875, "learning_rate": 1.4218750000000002e-05, "loss": 0.0086, "num_tokens": 21986072.0, "reward": 0.98046875, "reward_std": 0.13699321448802948, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1151.0, "completions/max_terminated_length": 1151.0, "completions/mean_length": 391.08984375, "completions/mean_terminated_length": 391.08984375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0719910011248594, "frac_reward_zero_std": 0.4375, "grad_norm": 0.32436041225763035, "kl": 0.21044921875, "learning_rate": 1.4296875000000002e-05, "loss": 0.0189, "num_tokens": 22116047.0, "reward": 0.87109375, "reward_std": 0.21157720685005188, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 814.0, "completions/max_terminated_length": 814.0, "completions/mean_length": 390.4375, "completions/mean_terminated_length": 390.4375, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.07238225656575537, "frac_reward_zero_std": 0.625, "grad_norm": 0.2775902213303993, "kl": 0.1776123046875, "learning_rate": 1.4375e-05, "loss": 0.0194, "num_tokens": 22246847.0, "reward": 1.108154296875, "reward_std": 0.12642107903957367, "rewards/accuracy_reward/mean": 0.859375, "rewards/accuracy_reward/std": 0.3483152687549591, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1093.0, "completions/max_terminated_length": 1093.0, "completions/mean_length": 355.640625, "completions/mean_terminated_length": 355.640625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.07277351200665134, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3152925590105812, "kl": 0.2032470703125, "learning_rate": 1.4453125e-05, "loss": -0.0018, "num_tokens": 22367411.0, "reward": 1.0390625, "reward_std": 0.1706668734550476, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 725.0, "completions/mean_length": 366.43359375, "completions/mean_terminated_length": 346.49407958984375, "completions/min_length": 96.0, "completions/min_terminated_length": 96.0, "epoch": 0.07316476744754732, "frac_reward_zero_std": 0.3125, "grad_norm": 0.43192127117969903, "kl": 0.2344970703125, "learning_rate": 1.453125e-05, "loss": 0.1434, "num_tokens": 22490786.0, "reward": 1.0802001953125, "reward_std": 0.18831481039524078, "rewards/accuracy_reward/mean": 0.83203125, "rewards/accuracy_reward/std": 0.3745708465576172, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 858.0, "completions/max_terminated_length": 858.0, "completions/mean_length": 343.5546875, "completions/mean_terminated_length": 343.5546875, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.07355602288844329, "frac_reward_zero_std": 0.5625, "grad_norm": 0.31054585217832625, "kl": 0.2257080078125, "learning_rate": 1.4609375000000001e-05, "loss": 0.0064, "num_tokens": 22606944.0, "reward": 1.015625, "reward_std": 0.1607646942138672, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 649.0, "completions/max_terminated_length": 649.0, "completions/mean_length": 314.421875, "completions/mean_terminated_length": 314.421875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.07394727832933927, "frac_reward_zero_std": 0.625, "grad_norm": 0.392118215277087, "kl": 0.239501953125, "learning_rate": 1.4687500000000001e-05, "loss": 0.0492, "num_tokens": 22716652.0, "reward": 1.140625, "reward_std": 0.12621080875396729, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 695.0, "completions/max_terminated_length": 695.0, "completions/mean_length": 334.90625, "completions/mean_terminated_length": 334.90625, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.07433853377023525, "frac_reward_zero_std": 0.625, "grad_norm": 0.2999371733981333, "kl": 0.2294921875, "learning_rate": 1.4765625000000001e-05, "loss": 0.021, "num_tokens": 22833508.0, "reward": 1.12890625, "reward_std": 0.14383356273174286, "rewards/accuracy_reward/mean": 0.87890625, "rewards/accuracy_reward/std": 0.3268752694129944, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 599.0, "completions/max_terminated_length": 599.0, "completions/mean_length": 327.1171875, "completions/mean_terminated_length": 327.1171875, "completions/min_length": 109.0, "completions/min_terminated_length": 109.0, "epoch": 0.07472978921113121, "frac_reward_zero_std": 0.5, "grad_norm": 0.333956784586216, "kl": 0.231201171875, "learning_rate": 1.4843750000000002e-05, "loss": -0.0168, "num_tokens": 22947586.0, "reward": 1.0390625, "reward_std": 0.1786910742521286, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 787.0, "completions/max_terminated_length": 787.0, "completions/mean_length": 372.2578125, "completions/mean_terminated_length": 372.2578125, "completions/min_length": 137.0, "completions/min_terminated_length": 137.0, "epoch": 0.0751210446520272, "frac_reward_zero_std": 0.4375, "grad_norm": 0.36009358172113004, "kl": 0.2027587890625, "learning_rate": 1.4921875000000002e-05, "loss": -0.0017, "num_tokens": 23074324.0, "reward": 1.04296875, "reward_std": 0.18497256934642792, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 908.0, "completions/max_terminated_length": 908.0, "completions/mean_length": 337.2578125, "completions/mean_terminated_length": 337.2578125, "completions/min_length": 111.0, "completions/min_terminated_length": 111.0, "epoch": 0.07551230009292317, "frac_reward_zero_std": 0.625, "grad_norm": 0.2917550972308187, "kl": 0.21337890625, "learning_rate": 1.5000000000000002e-05, "loss": 0.0163, "num_tokens": 23189478.0, "reward": 1.078125, "reward_std": 0.12621080875396729, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 807.0, "completions/max_terminated_length": 807.0, "completions/mean_length": 392.3515625, "completions/mean_terminated_length": 392.3515625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07590355553381914, "frac_reward_zero_std": 0.375, "grad_norm": 0.33525730687177646, "kl": 0.199462890625, "learning_rate": 1.5078125e-05, "loss": 0.0211, "num_tokens": 23318800.0, "reward": 1.003173828125, "reward_std": 0.2192877233028412, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 349.57421875, "completions/mean_terminated_length": 349.57421875, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.07629481097471512, "frac_reward_zero_std": 0.875, "grad_norm": 0.17070731512897083, "kl": 0.220947265625, "learning_rate": 1.515625e-05, "loss": 0.0055, "num_tokens": 23439539.0, "reward": 1.234375, "reward_std": 0.04081955552101135, "rewards/accuracy_reward/mean": 0.984375, "rewards/accuracy_reward/std": 0.12426253408193588, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 991.0, "completions/max_terminated_length": 991.0, "completions/mean_length": 380.72265625, "completions/mean_terminated_length": 380.72265625, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.0766860664156111, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4184644198810761, "kl": 0.197509765625, "learning_rate": 1.5234375000000001e-05, "loss": 0.0578, "num_tokens": 23566668.0, "reward": 1.13671875, "reward_std": 0.18651601672172546, "rewards/accuracy_reward/mean": 0.88671875, "rewards/accuracy_reward/std": 0.31755712628364563, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 921.0, "completions/max_terminated_length": 921.0, "completions/mean_length": 356.140625, "completions/mean_terminated_length": 356.140625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.07707732185650706, "frac_reward_zero_std": 0.8125, "grad_norm": 0.23769452933235632, "kl": 0.2310791015625, "learning_rate": 1.5312500000000003e-05, "loss": 0.0217, "num_tokens": 23686992.0, "reward": 1.09375, "reward_std": 0.06689241528511047, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1172.0, "completions/max_terminated_length": 1172.0, "completions/mean_length": 404.84765625, "completions/mean_terminated_length": 404.84765625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.07746857729740304, "frac_reward_zero_std": 0.375, "grad_norm": 0.3417980460502612, "kl": 0.2154541015625, "learning_rate": 1.5390625e-05, "loss": -0.0043, "num_tokens": 23819113.0, "reward": 1.0540771484375, "reward_std": 0.22964538633823395, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 379.64453125, "completions/mean_terminated_length": 379.64453125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.07785983273829902, "frac_reward_zero_std": 0.625, "grad_norm": 0.3164683799247801, "kl": 0.2236328125, "learning_rate": 1.546875e-05, "loss": 0.0204, "num_tokens": 23945694.0, "reward": 1.078125, "reward_std": 0.11840169876813889, "rewards/accuracy_reward/mean": 0.828125, "rewards/accuracy_reward/std": 0.3780108094215393, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1045.0, "completions/max_terminated_length": 1045.0, "completions/mean_length": 414.29296875, "completions/mean_terminated_length": 414.29296875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.07825108817919499, "frac_reward_zero_std": 0.5, "grad_norm": 0.359568158339096, "kl": 0.2294921875, "learning_rate": 1.5546875e-05, "loss": 0.063, "num_tokens": 24080825.0, "reward": 1.139404296875, "reward_std": 0.18834996223449707, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1086.0, "completions/max_terminated_length": 1086.0, "completions/mean_length": 436.82421875, "completions/mean_terminated_length": 436.82421875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.07864234362009097, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3135349154041634, "kl": 0.2379150390625, "learning_rate": 1.5625e-05, "loss": 0.017, "num_tokens": 24223356.0, "reward": 0.9881591796875, "reward_std": 0.08818283677101135, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 481.28125, "completions/mean_terminated_length": 481.28125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.07903359906098695, "frac_reward_zero_std": 0.5625, "grad_norm": 0.27021782997628757, "kl": 0.23828125, "learning_rate": 1.5703125e-05, "loss": 0.021, "num_tokens": 24376388.0, "reward": 1.0462646484375, "reward_std": 0.18816861510276794, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1428.0, "completions/max_terminated_length": 1428.0, "completions/mean_length": 405.33984375, "completions/mean_terminated_length": 405.33984375, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.07942485450188291, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3653552015205479, "kl": 0.2742919921875, "learning_rate": 1.578125e-05, "loss": 0.0135, "num_tokens": 24510043.0, "reward": 0.9609375, "reward_std": 0.09341736882925034, "rewards/accuracy_reward/mean": 0.7109375, "rewards/accuracy_reward/std": 0.45421501994132996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1212.0, "completions/max_terminated_length": 1212.0, "completions/mean_length": 487.09765625, "completions/mean_terminated_length": 487.09765625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.07981610994277889, "frac_reward_zero_std": 0.25, "grad_norm": 0.3424521582452758, "kl": 0.240234375, "learning_rate": 1.5859375e-05, "loss": 0.035, "num_tokens": 24665988.0, "reward": 1.0482177734375, "reward_std": 0.2598106563091278, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9833984375, "rewards/tag_count_reward/std": 0.09885508567094803, "step": 204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 499.76171875, "completions/mean_terminated_length": 499.76171875, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.08020736538367487, "frac_reward_zero_std": 0.375, "grad_norm": 0.25803943958803566, "kl": 0.2296142578125, "learning_rate": 1.59375e-05, "loss": 0.0042, "num_tokens": 24825511.0, "reward": 0.9970703125, "reward_std": 0.1492736041545868, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9765625, "rewards/tag_count_reward/std": 0.13075010478496552, "step": 205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1616.0, "completions/max_terminated_length": 1616.0, "completions/mean_length": 609.87109375, "completions/mean_terminated_length": 609.87109375, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.08059862082457084, "frac_reward_zero_std": 0.375, "grad_norm": 0.30842220980103213, "kl": 0.279052734375, "learning_rate": 1.6015625e-05, "loss": 0.0492, "num_tokens": 25009446.0, "reward": 0.958740234375, "reward_std": 0.20240747928619385, "rewards/accuracy_reward/mean": 0.7109375, "rewards/accuracy_reward/std": 0.45421501994132996, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.05828297883272171, "step": 206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1917.0, "completions/mean_length": 671.5234375, "completions/mean_terminated_length": 649.6746215820312, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.08098987626546682, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5350360004202757, "kl": 0.284912109375, "learning_rate": 1.609375e-05, "loss": 0.0935, "num_tokens": 25220748.0, "reward": 0.9656982421875, "reward_std": 0.2181750386953354, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.76953125, "rewards/format_reward/std": 0.4219578504562378, "rewards/tag_count_reward/mean": 0.8310546875, "rewards/tag_count_reward/std": 0.31934747099876404, "step": 207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1678.0, "completions/max_terminated_length": 1678.0, "completions/mean_length": 491.28515625, "completions/mean_terminated_length": 491.28515625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.0813811317063628, "frac_reward_zero_std": 0.75, "grad_norm": 0.22748744065025617, "kl": 0.299072265625, "learning_rate": 1.6171875000000002e-05, "loss": 0.0145, "num_tokens": 25376277.0, "reward": 1.158935546875, "reward_std": 0.06827030330896378, "rewards/accuracy_reward/mean": 0.91015625, "rewards/accuracy_reward/std": 0.2865179479122162, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 713.61328125, "completions/mean_terminated_length": 692.4325561523438, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.08177238714725876, "frac_reward_zero_std": 0.375, "grad_norm": 0.25296694793448965, "kl": 0.28076171875, "learning_rate": 1.6250000000000002e-05, "loss": 0.0127, "num_tokens": 25589026.0, "reward": 0.7716064453125, "reward_std": 0.2055341899394989, "rewards/accuracy_reward/mean": 0.52734375, "rewards/accuracy_reward/std": 0.5002297759056091, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.06364604830741882, "step": 209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1568.0, "completions/max_terminated_length": 1568.0, "completions/mean_length": 520.11328125, "completions/mean_terminated_length": 520.11328125, "completions/min_length": 184.0, "completions/min_terminated_length": 184.0, "epoch": 0.08216364258815474, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2741610414171672, "kl": 0.31396484375, "learning_rate": 1.6328125000000002e-05, "loss": 0.0246, "num_tokens": 25750783.0, "reward": 1.01318359375, "reward_std": 0.20888571441173553, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 2001.0, "completions/max_terminated_length": 2001.0, "completions/mean_length": 584.79296875, "completions/mean_terminated_length": 584.79296875, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.08255489802905072, "frac_reward_zero_std": 0.5, "grad_norm": 0.2639613866843938, "kl": 0.2783203125, "learning_rate": 1.6406250000000002e-05, "loss": 0.0134, "num_tokens": 25928010.0, "reward": 0.8966064453125, "reward_std": 0.16426289081573486, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1969.0, "completions/mean_length": 585.8984375, "completions/mean_terminated_length": 580.1647338867188, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.08294615346994669, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2655346217740031, "kl": 0.3154296875, "learning_rate": 1.6484375000000003e-05, "loss": 0.0525, "num_tokens": 26107376.0, "reward": 0.88916015625, "reward_std": 0.18047067523002625, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1902.0, "completions/mean_length": 596.52734375, "completions/mean_terminated_length": 585.0984497070312, "completions/min_length": 200.0, "completions/min_terminated_length": 200.0, "epoch": 0.08333740891084267, "frac_reward_zero_std": 0.3125, "grad_norm": 0.27899446879448647, "kl": 0.29150390625, "learning_rate": 1.6562500000000003e-05, "loss": 0.0534, "num_tokens": 26289111.0, "reward": 0.921875, "reward_std": 0.24696573615074158, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.05805254727602005, "step": 213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1952.0, "completions/mean_length": 605.73046875, "completions/mean_terminated_length": 600.0745239257812, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.08372866435173865, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3021368477502115, "kl": 0.330078125, "learning_rate": 1.6640625000000003e-05, "loss": 0.0597, "num_tokens": 26474226.0, "reward": 1.0487060546875, "reward_std": 0.2560558021068573, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1833.0, "completions/max_terminated_length": 1833.0, "completions/mean_length": 493.37109375, "completions/mean_terminated_length": 493.37109375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.08411991979263461, "frac_reward_zero_std": 0.25, "grad_norm": 0.3317543965635343, "kl": 0.322509765625, "learning_rate": 1.671875e-05, "loss": 0.0148, "num_tokens": 26630465.0, "reward": 0.9310302734375, "reward_std": 0.23925837874412537, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.08903051167726517, "step": 215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1127.0, "completions/mean_length": 494.83203125, "completions/mean_terminated_length": 488.7412109375, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.08451117523353059, "frac_reward_zero_std": 0.1875, "grad_norm": 0.38276826684664395, "kl": 0.3232421875, "learning_rate": 1.6796875e-05, "loss": 0.0722, "num_tokens": 26786438.0, "reward": 0.964111328125, "reward_std": 0.21846899390220642, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.970703125, "rewards/tag_count_reward/std": 0.14218565821647644, "step": 216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1301.0, "completions/max_terminated_length": 1301.0, "completions/mean_length": 476.796875, "completions/mean_terminated_length": 476.796875, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.08490243067442657, "frac_reward_zero_std": 0.3125, "grad_norm": 0.27554661934643127, "kl": 0.340087890625, "learning_rate": 1.6875e-05, "loss": 0.0258, "num_tokens": 26936914.0, "reward": 0.9888916015625, "reward_std": 0.2022039294242859, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.09666129946708679, "step": 217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1162.0, "completions/max_terminated_length": 1162.0, "completions/mean_length": 440.2421875, "completions/mean_terminated_length": 440.2421875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.08529368611532254, "frac_reward_zero_std": 0.25, "grad_norm": 0.30498387591136494, "kl": 0.288330078125, "learning_rate": 1.6953125e-05, "loss": 0.0172, "num_tokens": 27080048.0, "reward": 1.085205078125, "reward_std": 0.21793940663337708, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 942.0, "completions/max_terminated_length": 942.0, "completions/mean_length": 369.984375, "completions/mean_terminated_length": 369.984375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.08568494155621852, "frac_reward_zero_std": 0.4375, "grad_norm": 0.33439173503271047, "kl": 0.318359375, "learning_rate": 1.703125e-05, "loss": 0.0097, "num_tokens": 27203260.0, "reward": 0.9921875, "reward_std": 0.20521603524684906, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1147.0, "completions/max_terminated_length": 1147.0, "completions/mean_length": 423.59765625, "completions/mean_terminated_length": 423.59765625, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.0860761969971145, "frac_reward_zero_std": 0.5, "grad_norm": 0.3375422232355613, "kl": 0.2666015625, "learning_rate": 1.7109375e-05, "loss": 0.0217, "num_tokens": 27341541.0, "reward": 1.125, "reward_std": 0.16406384110450745, "rewards/accuracy_reward/mean": 0.875, "rewards/accuracy_reward/std": 0.33136674761772156, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 876.0, "completions/max_terminated_length": 876.0, "completions/mean_length": 372.08203125, "completions/mean_terminated_length": 372.08203125, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.08646745243801046, "frac_reward_zero_std": 0.5, "grad_norm": 0.29399569834177397, "kl": 0.2783203125, "learning_rate": 1.71875e-05, "loss": 0.0226, "num_tokens": 27464570.0, "reward": 1.0111083984375, "reward_std": 0.13943463563919067, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.4268665909767151, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1244.0, "completions/max_terminated_length": 1244.0, "completions/mean_length": 444.9765625, "completions/mean_terminated_length": 444.9765625, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.08685870787890644, "frac_reward_zero_std": 0.5625, "grad_norm": 0.26129553562339936, "kl": 0.261474609375, "learning_rate": 1.7265625e-05, "loss": 0.0029, "num_tokens": 27608180.0, "reward": 1.0859375, "reward_std": 0.15404410660266876, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1365.0, "completions/max_terminated_length": 1365.0, "completions/mean_length": 422.26953125, "completions/mean_terminated_length": 422.26953125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.08724996331980242, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4752998730147662, "kl": 0.2564697265625, "learning_rate": 1.734375e-05, "loss": 0.0389, "num_tokens": 27745881.0, "reward": 1.0230712890625, "reward_std": 0.15446604788303375, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0464647077023983, "step": 223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1556.0, "completions/max_terminated_length": 1556.0, "completions/mean_length": 486.84765625, "completions/mean_terminated_length": 486.84765625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.08764121876069839, "frac_reward_zero_std": 0.5625, "grad_norm": 0.26272142847244373, "kl": 0.2635498046875, "learning_rate": 1.7421875e-05, "loss": 0.0159, "num_tokens": 27897666.0, "reward": 0.96875, "reward_std": 0.1816575825214386, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1239.0, "completions/max_terminated_length": 1239.0, "completions/mean_length": 374.12109375, "completions/mean_terminated_length": 374.12109375, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.08803247420159437, "frac_reward_zero_std": 0.5625, "grad_norm": 0.40356120979319227, "kl": 0.297607421875, "learning_rate": 1.7500000000000002e-05, "loss": 0.042, "num_tokens": 28022337.0, "reward": 1.10498046875, "reward_std": 0.13752326369285583, "rewards/accuracy_reward/mean": 0.85546875, "rewards/accuracy_reward/std": 0.35231640934944153, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1405.0, "completions/mean_length": 540.04296875, "completions/mean_terminated_length": 452.8057556152344, "completions/min_length": 187.0, "completions/min_terminated_length": 187.0, "epoch": 0.08842372964249035, "frac_reward_zero_std": 0.25, "grad_norm": 0.5266644959752, "kl": 0.267822265625, "learning_rate": 1.7578125000000002e-05, "loss": 0.2953, "num_tokens": 28190444.0, "reward": 0.9698486328125, "reward_std": 0.1798657774925232, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.20342643558979034, "step": 226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1168.0, "completions/max_terminated_length": 1168.0, "completions/mean_length": 379.08984375, "completions/mean_terminated_length": 379.08984375, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.08881498508338631, "frac_reward_zero_std": 0.75, "grad_norm": 8.239222706736252, "kl": 1.702392578125, "learning_rate": 1.7656250000000002e-05, "loss": 0.0366, "num_tokens": 28315987.0, "reward": 1.15576171875, "reward_std": 0.06884554028511047, "rewards/accuracy_reward/mean": 0.90625, "rewards/accuracy_reward/std": 0.2920515835285187, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.0625, "step": 227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1574.0, "completions/max_terminated_length": 1574.0, "completions/mean_length": 441.37890625, "completions/mean_terminated_length": 441.37890625, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.08920624052428229, "frac_reward_zero_std": 0.5, "grad_norm": 0.2848324889799531, "kl": 0.24658203125, "learning_rate": 1.7734375000000002e-05, "loss": -0.0009, "num_tokens": 28459604.0, "reward": 0.9765625, "reward_std": 0.18143153190612793, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2007.0, "completions/mean_length": 458.5546875, "completions/mean_terminated_length": 452.32159423828125, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.08959749596517827, "frac_reward_zero_std": 0.0, "grad_norm": 0.7474755869430539, "kl": 0.326416015625, "learning_rate": 1.7812500000000003e-05, "loss": 0.0462, "num_tokens": 28606482.0, "reward": 0.8121337890625, "reward_std": 0.27084317803382874, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.56640625, "rewards/format_reward/std": 0.4965413510799408, "rewards/tag_count_reward/mean": 0.8369140625, "rewards/tag_count_reward/std": 0.24657388031482697, "step": 229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1630.0, "completions/max_terminated_length": 1630.0, "completions/mean_length": 446.41015625, "completions/mean_terminated_length": 446.41015625, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.08998875140607424, "frac_reward_zero_std": 0.75, "grad_norm": 0.1928226579608651, "kl": 0.23974609375, "learning_rate": 1.7890625000000003e-05, "loss": 0.0367, "num_tokens": 28750571.0, "reward": 1.1640625, "reward_std": 0.07206955552101135, "rewards/accuracy_reward/mean": 0.9140625, "rewards/accuracy_reward/std": 0.28082075715065, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1328.0, "completions/max_terminated_length": 1328.0, "completions/mean_length": 488.921875, "completions/mean_terminated_length": 488.921875, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.09038000684697022, "frac_reward_zero_std": 0.125, "grad_norm": 0.3582954300987468, "kl": 0.278564453125, "learning_rate": 1.7968750000000003e-05, "loss": 0.0181, "num_tokens": 28905255.0, "reward": 1.0499267578125, "reward_std": 0.2764754593372345, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 0.87109375, "rewards/format_reward/std": 0.33575257658958435, "rewards/tag_count_reward/mean": 0.9345703125, "rewards/tag_count_reward/std": 0.17813360691070557, "step": 231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1329.0, "completions/max_terminated_length": 1329.0, "completions/mean_length": 408.09765625, "completions/mean_terminated_length": 408.09765625, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.0907712622878662, "frac_reward_zero_std": 0.4375, "grad_norm": 0.319572427381139, "kl": 0.32177734375, "learning_rate": 1.8046875e-05, "loss": 0.012, "num_tokens": 29041120.0, "reward": 0.9976806640625, "reward_std": 0.17539697885513306, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.0554114393889904, "step": 232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1073.0, "completions/max_terminated_length": 1073.0, "completions/mean_length": 406.828125, "completions/mean_terminated_length": 406.828125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.09116251772876216, "frac_reward_zero_std": 0.375, "grad_norm": 0.29207976927999113, "kl": 0.3740234375, "learning_rate": 1.8125e-05, "loss": 0.0074, "num_tokens": 29174548.0, "reward": 0.9825439453125, "reward_std": 0.19916865229606628, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1056.0, "completions/max_terminated_length": 1056.0, "completions/mean_length": 464.5625, "completions/mean_terminated_length": 464.5625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.09155377316965814, "frac_reward_zero_std": 0.25, "grad_norm": 0.34285649746189734, "kl": 0.319580078125, "learning_rate": 1.8203125e-05, "loss": 0.0709, "num_tokens": 29322084.0, "reward": 1.002197265625, "reward_std": 0.25457847118377686, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1954.0, "completions/max_terminated_length": 1954.0, "completions/mean_length": 429.12890625, "completions/mean_terminated_length": 429.12890625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09194502861055412, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3344524680337601, "kl": 0.39599609375, "learning_rate": 1.828125e-05, "loss": -0.0067, "num_tokens": 29459941.0, "reward": 1.0535888671875, "reward_std": 0.2940719425678253, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1563.0, "completions/max_terminated_length": 1563.0, "completions/mean_length": 529.95703125, "completions/mean_terminated_length": 529.95703125, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.09233628405145008, "frac_reward_zero_std": 0.0, "grad_norm": 0.3233747713131115, "kl": 0.3388671875, "learning_rate": 1.8359375e-05, "loss": 0.0473, "num_tokens": 29626938.0, "reward": 0.8787841796875, "reward_std": 0.41007477045059204, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.05999100208282471, "step": 236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 932.0, "completions/max_terminated_length": 932.0, "completions/mean_length": 369.73046875, "completions/mean_terminated_length": 369.73046875, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.09272753949234606, "frac_reward_zero_std": 0.375, "grad_norm": 0.3408988045712947, "kl": 0.426513671875, "learning_rate": 1.84375e-05, "loss": 0.0361, "num_tokens": 29750277.0, "reward": 1.0880126953125, "reward_std": 0.21140283346176147, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1482.0, "completions/mean_length": 471.72265625, "completions/mean_terminated_length": 465.54119873046875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.09311879493324204, "frac_reward_zero_std": 0.3125, "grad_norm": 0.32028641386663737, "kl": 0.380859375, "learning_rate": 1.8515625e-05, "loss": 0.051, "num_tokens": 29898878.0, "reward": 1.0216064453125, "reward_std": 0.2399899661540985, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 376.2578125, "completions/mean_terminated_length": 376.2578125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.09351005037413801, "frac_reward_zero_std": 0.3125, "grad_norm": 0.31674756961339046, "kl": 0.38671875, "learning_rate": 1.859375e-05, "loss": 0.0409, "num_tokens": 30024128.0, "reward": 0.932373046875, "reward_std": 0.22485382854938507, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1694.0, "completions/max_terminated_length": 1694.0, "completions/mean_length": 416.54296875, "completions/mean_terminated_length": 416.54296875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.09390130581503399, "frac_reward_zero_std": 0.125, "grad_norm": 0.44357985485662815, "kl": 0.42626953125, "learning_rate": 1.8671875e-05, "loss": 0.0591, "num_tokens": 30163099.0, "reward": 0.9786376953125, "reward_std": 0.35346323251724243, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1059.0, "completions/max_terminated_length": 1059.0, "completions/mean_length": 386.30859375, "completions/mean_terminated_length": 386.30859375, "completions/min_length": 118.0, "completions/min_terminated_length": 118.0, "epoch": 0.09429256125592997, "frac_reward_zero_std": 0.25, "grad_norm": 0.3496164783972592, "kl": 0.40380859375, "learning_rate": 1.8750000000000002e-05, "loss": 0.034, "num_tokens": 30291114.0, "reward": 0.98193359375, "reward_std": 0.2954230010509491, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1386.0, "completions/max_terminated_length": 1386.0, "completions/mean_length": 362.625, "completions/mean_terminated_length": 362.625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.09468381669682593, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3945711657183597, "kl": 0.5009765625, "learning_rate": 1.8828125000000002e-05, "loss": 0.0414, "num_tokens": 30414570.0, "reward": 0.99462890625, "reward_std": 0.26441827416419983, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.06537505239248276, "step": 242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 366.59765625, "completions/mean_terminated_length": 366.59765625, "completions/min_length": 73.0, "completions/min_terminated_length": 73.0, "epoch": 0.09507507213772191, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3987537041850179, "kl": 0.4853515625, "learning_rate": 1.8906250000000002e-05, "loss": 0.0499, "num_tokens": 30537411.0, "reward": 0.94677734375, "reward_std": 0.31390509009361267, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1870.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 398.76171875, "completions/mean_terminated_length": 398.76171875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.0954663275786179, "frac_reward_zero_std": 0.1875, "grad_norm": 0.38757648055895033, "kl": 0.447998046875, "learning_rate": 1.8984375000000002e-05, "loss": 0.0486, "num_tokens": 30668374.0, "reward": 1.087890625, "reward_std": 0.2919348180294037, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06957504153251648, "step": 244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1039.0, "completions/max_terminated_length": 1039.0, "completions/mean_length": 301.546875, "completions/mean_terminated_length": 301.546875, "completions/min_length": 84.0, "completions/min_terminated_length": 84.0, "epoch": 0.09585758301951386, "frac_reward_zero_std": 0.1875, "grad_norm": 0.5064487843545451, "kl": 0.51220703125, "learning_rate": 1.9062500000000003e-05, "loss": 0.0173, "num_tokens": 30772962.0, "reward": 1.1156005859375, "reward_std": 0.16597115993499756, "rewards/accuracy_reward/mean": 0.87109375, "rewards/accuracy_reward/std": 0.33575257658958435, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9677734375, "rewards/tag_count_reward/std": 0.11999396979808807, "step": 245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1340.0, "completions/max_terminated_length": 1340.0, "completions/mean_length": 476.99609375, "completions/mean_terminated_length": 476.99609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.09624883846040984, "frac_reward_zero_std": 0.125, "grad_norm": 0.4128012091605713, "kl": 0.4072265625, "learning_rate": 1.9140625000000003e-05, "loss": 0.0698, "num_tokens": 30924753.0, "reward": 0.8931884765625, "reward_std": 0.33010074496269226, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1446.0, "completions/max_terminated_length": 1446.0, "completions/mean_length": 535.07421875, "completions/mean_terminated_length": 535.07421875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.09664009390130582, "frac_reward_zero_std": 0.3125, "grad_norm": 0.30304942990127715, "kl": 0.43701171875, "learning_rate": 1.9218750000000003e-05, "loss": 0.0328, "num_tokens": 31092116.0, "reward": 0.9407958984375, "reward_std": 0.2804175019264221, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1771.0, "completions/max_terminated_length": 1771.0, "completions/mean_length": 514.38671875, "completions/mean_terminated_length": 514.38671875, "completions/min_length": 97.0, "completions/min_terminated_length": 97.0, "epoch": 0.09703134934220178, "frac_reward_zero_std": 0.375, "grad_norm": 0.37443311429696635, "kl": 0.46337890625, "learning_rate": 1.9296875000000003e-05, "loss": 0.022, "num_tokens": 31254007.0, "reward": 1.0540771484375, "reward_std": 0.24119773507118225, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1518.0, "completions/max_terminated_length": 1518.0, "completions/mean_length": 408.0859375, "completions/mean_terminated_length": 408.0859375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09742260478309776, "frac_reward_zero_std": 0.125, "grad_norm": 0.5175489147692418, "kl": 0.484130859375, "learning_rate": 1.9375e-05, "loss": -0.0041, "num_tokens": 31385837.0, "reward": 1.0474853515625, "reward_std": 0.26607465744018555, "rewards/accuracy_reward/mean": 0.82421875, "rewards/accuracy_reward/std": 0.3813795745372772, "rewards/format_reward/mean": 0.85546875, "rewards/format_reward/std": 0.35231640934944153, "rewards/tag_count_reward/mean": 0.9306640625, "rewards/tag_count_reward/std": 0.17243115603923798, "step": 249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1751.0, "completions/max_terminated_length": 1751.0, "completions/mean_length": 575.83984375, "completions/mean_terminated_length": 575.83984375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.09781386022399374, "frac_reward_zero_std": 0.0625, "grad_norm": 0.31554048485420655, "kl": 0.44970703125, "learning_rate": 1.9453125e-05, "loss": 0.0592, "num_tokens": 31563156.0, "reward": 0.8809814453125, "reward_std": 0.3404023349285126, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1352.0, "completions/mean_length": 410.515625, "completions/mean_terminated_length": 404.0941467285156, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.09820511566488971, "frac_reward_zero_std": 0.375, "grad_norm": 0.35881506806220603, "kl": 0.492431640625, "learning_rate": 1.953125e-05, "loss": 0.0842, "num_tokens": 31698072.0, "reward": 1.10009765625, "reward_std": 0.21724170446395874, "rewards/accuracy_reward/mean": 0.8515625, "rewards/accuracy_reward/std": 0.3562295734882355, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1181.0, "completions/max_terminated_length": 1181.0, "completions/mean_length": 433.921875, "completions/mean_terminated_length": 433.921875, "completions/min_length": 76.0, "completions/min_terminated_length": 76.0, "epoch": 0.09859637110578569, "frac_reward_zero_std": 0.25, "grad_norm": 0.37139439977366356, "kl": 0.513427734375, "learning_rate": 1.9609375e-05, "loss": -0.0137, "num_tokens": 31838708.0, "reward": 0.9051513671875, "reward_std": 0.30087730288505554, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47588926553726196, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2000.0, "completions/mean_length": 507.203125, "completions/mean_terminated_length": 501.16082763671875, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.09898762654668167, "frac_reward_zero_std": 0.25, "grad_norm": 0.2979926471163034, "kl": 0.439697265625, "learning_rate": 1.96875e-05, "loss": 0.0282, "num_tokens": 31996504.0, "reward": 0.8719482421875, "reward_std": 0.2662973403930664, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1785.0, "completions/max_terminated_length": 1785.0, "completions/mean_length": 486.12109375, "completions/mean_terminated_length": 486.12109375, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.09937888198757763, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3252385703624758, "kl": 0.522216796875, "learning_rate": 1.9765625e-05, "loss": 0.077, "num_tokens": 32149143.0, "reward": 1.0150146484375, "reward_std": 0.2716185450553894, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 963.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 373.6484375, "completions/mean_terminated_length": 373.6484375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.09977013742847361, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4609963859842547, "kl": 0.6630859375, "learning_rate": 1.984375e-05, "loss": 0.0256, "num_tokens": 32274397.0, "reward": 1.0247802734375, "reward_std": 0.183428093791008, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.91015625, "rewards/format_reward/std": 0.2865179479122162, "rewards/tag_count_reward/mean": 0.9755859375, "rewards/tag_count_reward/std": 0.1021324098110199, "step": 255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1875.0, "completions/max_terminated_length": 1875.0, "completions/mean_length": 471.3046875, "completions/mean_terminated_length": 471.3046875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.10016139286936959, "frac_reward_zero_std": 0.25, "grad_norm": 0.3708630007110027, "kl": 0.574462890625, "learning_rate": 1.9921875e-05, "loss": 0.0609, "num_tokens": 32424891.0, "reward": 0.807373046875, "reward_std": 0.27390164136886597, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1257.0, "completions/max_terminated_length": 1257.0, "completions/mean_length": 392.5390625, "completions/mean_terminated_length": 392.5390625, "completions/min_length": 94.0, "completions/min_terminated_length": 94.0, "epoch": 0.10055264831026556, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3713457785918941, "kl": 0.6064453125, "learning_rate": 2e-05, "loss": 0.0693, "num_tokens": 32557541.0, "reward": 0.9921875, "reward_std": 0.20597384870052338, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1677.0, "completions/mean_length": 416.00390625, "completions/mean_terminated_length": 409.60394287109375, "completions/min_length": 87.0, "completions/min_terminated_length": 87.0, "epoch": 0.10094390375116154, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3748503218431077, "kl": 0.61572265625, "learning_rate": 1.9999990671452868e-05, "loss": 0.0858, "num_tokens": 32692022.0, "reward": 0.96337890625, "reward_std": 0.23298104107379913, "rewards/accuracy_reward/mean": 0.71484375, "rewards/accuracy_reward/std": 0.4523732364177704, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 958.0, "completions/max_terminated_length": 958.0, "completions/mean_length": 358.38671875, "completions/mean_terminated_length": 358.38671875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.10133515919205752, "frac_reward_zero_std": 0.25, "grad_norm": 0.47931837873844696, "kl": 0.669921875, "learning_rate": 1.9999962685828877e-05, "loss": 0.0961, "num_tokens": 32811881.0, "reward": 1.0653076171875, "reward_std": 0.24143050611019135, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1179.0, "completions/max_terminated_length": 1179.0, "completions/mean_length": 397.66796875, "completions/mean_terminated_length": 397.66796875, "completions/min_length": 93.0, "completions/min_terminated_length": 93.0, "epoch": 0.1017264146329535, "frac_reward_zero_std": 0.0625, "grad_norm": 0.4043716580301221, "kl": 0.5732421875, "learning_rate": 1.999991604318023e-05, "loss": -0.0001, "num_tokens": 32943908.0, "reward": 0.9429931640625, "reward_std": 0.34864363074302673, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 951.0, "completions/max_terminated_length": 951.0, "completions/mean_length": 342.08203125, "completions/mean_terminated_length": 342.08203125, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.10211767007384946, "frac_reward_zero_std": 0.25, "grad_norm": 0.4367084027968529, "kl": 0.70703125, "learning_rate": 1.9999850743593963e-05, "loss": 0.0518, "num_tokens": 33061161.0, "reward": 1.0958251953125, "reward_std": 0.26021116971969604, "rewards/accuracy_reward/mean": 0.84765625, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1075.0, "completions/max_terminated_length": 1075.0, "completions/mean_length": 371.9765625, "completions/mean_terminated_length": 371.9765625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.10250892551474544, "frac_reward_zero_std": 0.375, "grad_norm": 0.3928379347229735, "kl": 0.60595703125, "learning_rate": 1.9999766787191897e-05, "loss": 0.0258, "num_tokens": 33185123.0, "reward": 1.0062255859375, "reward_std": 0.22519409656524658, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 375.796875, "completions/mean_terminated_length": 375.796875, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.10290018095564142, "frac_reward_zero_std": 0.375, "grad_norm": 0.3703820177102917, "kl": 0.60205078125, "learning_rate": 1.999966417413067e-05, "loss": 0.0134, "num_tokens": 33310607.0, "reward": 0.984375, "reward_std": 0.23140643537044525, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1020.0, "completions/max_terminated_length": 1020.0, "completions/mean_length": 357.88671875, "completions/mean_terminated_length": 357.88671875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.10329143639653739, "frac_reward_zero_std": 0.375, "grad_norm": 0.4049791129392889, "kl": 0.64599609375, "learning_rate": 1.9999542904601733e-05, "loss": 0.0151, "num_tokens": 33430738.0, "reward": 1.00341796875, "reward_std": 0.2254645824432373, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1121.0, "completions/max_terminated_length": 1121.0, "completions/mean_length": 400.16015625, "completions/mean_terminated_length": 400.16015625, "completions/min_length": 115.0, "completions/min_terminated_length": 115.0, "epoch": 0.10368269183743337, "frac_reward_zero_std": 0.25, "grad_norm": 0.41426659016088174, "kl": 0.59375, "learning_rate": 1.999940297883134e-05, "loss": 0.0438, "num_tokens": 33564443.0, "reward": 0.9017333984375, "reward_std": 0.2900923788547516, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1226.0, "completions/max_terminated_length": 1226.0, "completions/mean_length": 425.3515625, "completions/mean_terminated_length": 425.3515625, "completions/min_length": 91.0, "completions/min_terminated_length": 91.0, "epoch": 0.10407394727832935, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3608508707350292, "kl": 0.62060546875, "learning_rate": 1.9999244397080545e-05, "loss": 0.0364, "num_tokens": 33701877.0, "reward": 0.8916015625, "reward_std": 0.25061821937561035, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1010.0, "completions/max_terminated_length": 1010.0, "completions/mean_length": 417.24609375, "completions/mean_terminated_length": 417.24609375, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.10446520271922531, "frac_reward_zero_std": 0.1875, "grad_norm": 0.40266041501894967, "kl": 0.568359375, "learning_rate": 1.9999067159645222e-05, "loss": 0.034, "num_tokens": 33839748.0, "reward": 0.9759521484375, "reward_std": 0.3260706663131714, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 402.56640625, "completions/mean_terminated_length": 402.56640625, "completions/min_length": 160.0, "completions/min_terminated_length": 160.0, "epoch": 0.10485645816012129, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4173263848634124, "kl": 0.60693359375, "learning_rate": 1.9998871266856043e-05, "loss": 0.0299, "num_tokens": 33972453.0, "reward": 0.97900390625, "reward_std": 0.3372042179107666, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1104.0, "completions/max_terminated_length": 1104.0, "completions/mean_length": 373.6953125, "completions/mean_terminated_length": 373.6953125, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.10524771360101727, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3652682411607663, "kl": 0.64208984375, "learning_rate": 1.9998656719078485e-05, "loss": 0.045, "num_tokens": 34097591.0, "reward": 1.000732421875, "reward_std": 0.144551083445549, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0539139099419117, "step": 269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1218.0, "completions/max_terminated_length": 1218.0, "completions/mean_length": 325.5546875, "completions/mean_terminated_length": 325.5546875, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.10563896904191324, "frac_reward_zero_std": 0.375, "grad_norm": 0.3801112502303859, "kl": 0.65087890625, "learning_rate": 1.999842351671283e-05, "loss": 0.0258, "num_tokens": 34209893.0, "reward": 1.09130859375, "reward_std": 0.18691867589950562, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 376.37890625, "completions/mean_terminated_length": 376.37890625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.10603022448280922, "frac_reward_zero_std": 0.1875, "grad_norm": 0.39842817639971656, "kl": 0.5810546875, "learning_rate": 1.999817166019417e-05, "loss": 0.0277, "num_tokens": 34335398.0, "reward": 1.018798828125, "reward_std": 0.3150246739387512, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1945.0, "completions/max_terminated_length": 1945.0, "completions/mean_length": 458.50390625, "completions/mean_terminated_length": 458.50390625, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1064214799237052, "frac_reward_zero_std": 0.5625, "grad_norm": 0.31200361717311675, "kl": 0.54736328125, "learning_rate": 1.99979011499924e-05, "loss": 0.0365, "num_tokens": 34482791.0, "reward": 1.01953125, "reward_std": 0.1780368983745575, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1775.0, "completions/max_terminated_length": 1775.0, "completions/mean_length": 472.00390625, "completions/mean_terminated_length": 472.00390625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.10681273536460116, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3552786593837735, "kl": 0.5751953125, "learning_rate": 1.9997611986612203e-05, "loss": 0.1029, "num_tokens": 34634456.0, "reward": 1.083740234375, "reward_std": 0.27202826738357544, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.0661611557006836, "step": 273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1930.0, "completions/mean_length": 600.375, "completions/mean_terminated_length": 583.2095336914062, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.10720399080549714, "frac_reward_zero_std": 0.4375, "grad_norm": 0.28350301114127724, "kl": 0.60009765625, "learning_rate": 1.9997304170593083e-05, "loss": 0.0435, "num_tokens": 34816808.0, "reward": 0.8524169921875, "reward_std": 0.2222909927368164, "rewards/accuracy_reward/mean": 0.60546875, "rewards/accuracy_reward/std": 0.48970720171928406, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2020.0, "completions/mean_length": 623.09765625, "completions/mean_terminated_length": 611.8779296875, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.10759524624639312, "frac_reward_zero_std": 0.3125, "grad_norm": 0.287249565720478, "kl": 0.54736328125, "learning_rate": 1.9996977702509332e-05, "loss": 0.1053, "num_tokens": 35007649.0, "reward": 0.88671875, "reward_std": 0.2731313407421112, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.048884619027376175, "step": 275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1959.0, "completions/mean_length": 692.515625, "completions/mean_terminated_length": 665.5139770507812, "completions/min_length": 209.0, "completions/min_terminated_length": 209.0, "epoch": 0.10798650168728909, "frac_reward_zero_std": 0.25, "grad_norm": 0.3033225447278893, "kl": 0.55908203125, "learning_rate": 1.999663258297004e-05, "loss": 0.0902, "num_tokens": 35212773.0, "reward": 1.0882568359375, "reward_std": 0.256381094455719, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.08034826815128326, "step": 276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.8671875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1839.0, "completions/mean_length": 746.15625, "completions/mean_terminated_length": 653.5564575195312, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.10837775712818506, "frac_reward_zero_std": 0.25, "grad_norm": 0.28998664189325846, "kl": 0.52783203125, "learning_rate": 1.999626881261911e-05, "loss": 0.1304, "num_tokens": 35433693.0, "reward": 0.928466796875, "reward_std": 0.31398987770080566, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.91796875, "rewards/format_reward/std": 0.2749498784542084, "rewards/tag_count_reward/mean": 0.978515625, "rewards/tag_count_reward/std": 0.07361361384391785, "step": 277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9296875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2027.0, "completions/mean_length": 733.57421875, "completions/mean_terminated_length": 685.68017578125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.10876901256908104, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2700561427568146, "kl": 0.535888671875, "learning_rate": 1.999588639213522e-05, "loss": 0.0391, "num_tokens": 35651200.0, "reward": 0.9046630859375, "reward_std": 0.24476918578147888, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1818.0, "completions/mean_length": 571.125, "completions/mean_terminated_length": 559.4960327148438, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.10916026800997701, "frac_reward_zero_std": 0.5, "grad_norm": 0.22600822274406884, "kl": 0.54931640625, "learning_rate": 1.9995485322231866e-05, "loss": 0.0323, "num_tokens": 35825136.0, "reward": 0.9822998046875, "reward_std": 0.16083058714866638, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.041130900382995605, "step": 279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1778.0, "completions/max_terminated_length": 1778.0, "completions/mean_length": 514.32421875, "completions/mean_terminated_length": 514.32421875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.10955152345087299, "frac_reward_zero_std": 0.5625, "grad_norm": 0.2454443885271576, "kl": 0.6376953125, "learning_rate": 1.9995065603657317e-05, "loss": 0.0238, "num_tokens": 35985123.0, "reward": 1.089111328125, "reward_std": 0.1212083101272583, "rewards/accuracy_reward/mean": 0.83984375, "rewards/accuracy_reward/std": 0.36746934056282043, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2021.0, "completions/mean_length": 813.41015625, "completions/mean_terminated_length": 793.8135375976562, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.10994277889176897, "frac_reward_zero_std": 0.25, "grad_norm": 0.21592464960901983, "kl": 0.484375, "learning_rate": 1.9994627237194654e-05, "loss": 0.0909, "num_tokens": 36223436.0, "reward": 0.7821044921875, "reward_std": 0.3241814076900482, "rewards/accuracy_reward/mean": 0.53515625, "rewards/accuracy_reward/std": 0.49973952770233154, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 544.50390625, "completions/mean_terminated_length": 544.50390625, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.11033403433266493, "frac_reward_zero_std": 0.25, "grad_norm": 0.27407449216858365, "kl": 0.5712890625, "learning_rate": 1.999417022366174e-05, "loss": 0.0239, "num_tokens": 36392845.0, "reward": 0.8680419921875, "reward_std": 0.3163914680480957, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1886.0, "completions/mean_length": 634.31640625, "completions/mean_terminated_length": 617.5534057617188, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.11072528977356091, "frac_reward_zero_std": 0.25, "grad_norm": 0.2645487909290685, "kl": 0.56201171875, "learning_rate": 1.999369456391123e-05, "loss": 0.1083, "num_tokens": 36585950.0, "reward": 0.8233642578125, "reward_std": 0.29639238119125366, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1948.0, "completions/mean_length": 515.78125, "completions/mean_terminated_length": 509.7725830078125, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1111165452144569, "frac_reward_zero_std": 0.5, "grad_norm": 0.2865193603116024, "kl": 0.58056640625, "learning_rate": 1.999320025883057e-05, "loss": 0.0492, "num_tokens": 36747110.0, "reward": 1.1153564453125, "reward_std": 0.16979053616523743, "rewards/accuracy_reward/mean": 0.8671875, "rewards/accuracy_reward/std": 0.3400367796421051, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1134.0, "completions/max_terminated_length": 1134.0, "completions/mean_length": 383.0234375, "completions/mean_terminated_length": 383.0234375, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.11150780065535286, "frac_reward_zero_std": 0.375, "grad_norm": 0.32889672152688665, "kl": 0.64892578125, "learning_rate": 1.9992687309341976e-05, "loss": 0.0236, "num_tokens": 36874284.0, "reward": 0.91015625, "reward_std": 0.2455040067434311, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1385.0, "completions/max_terminated_length": 1385.0, "completions/mean_length": 436.00390625, "completions/mean_terminated_length": 436.00390625, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.11189905609624884, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3089776414038001, "kl": 0.63818359375, "learning_rate": 1.9992155716402478e-05, "loss": 0.0088, "num_tokens": 37015853.0, "reward": 0.800048828125, "reward_std": 0.29239729046821594, "rewards/accuracy_reward/mean": 0.55078125, "rewards/accuracy_reward/std": 0.49838894605636597, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1475.0, "completions/max_terminated_length": 1475.0, "completions/mean_length": 378.28515625, "completions/mean_terminated_length": 378.28515625, "completions/min_length": 110.0, "completions/min_terminated_length": 110.0, "epoch": 0.11229031153714482, "frac_reward_zero_std": 0.375, "grad_norm": 0.3051112492357909, "kl": 0.66357421875, "learning_rate": 1.9991605481003865e-05, "loss": 0.0617, "num_tokens": 37140854.0, "reward": 0.99609375, "reward_std": 0.24249455332756042, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1836.0, "completions/mean_length": 462.5625, "completions/mean_terminated_length": 456.3451232910156, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.11268156697804078, "frac_reward_zero_std": 0.25, "grad_norm": 0.3371716422324308, "kl": 0.66162109375, "learning_rate": 1.9991036604172725e-05, "loss": 0.0394, "num_tokens": 37290038.0, "reward": 0.8939208984375, "reward_std": 0.3249357342720032, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1167.0, "completions/max_terminated_length": 1167.0, "completions/mean_length": 361.79296875, "completions/mean_terminated_length": 361.79296875, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.11307282241893676, "frac_reward_zero_std": 0.1875, "grad_norm": 0.37098726315795555, "kl": 0.70361328125, "learning_rate": 1.9990449086970404e-05, "loss": 0.0439, "num_tokens": 37413361.0, "reward": 0.932373046875, "reward_std": 0.3588939905166626, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1154.0, "completions/max_terminated_length": 1154.0, "completions/mean_length": 373.1328125, "completions/mean_terminated_length": 373.1328125, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.11346407785983274, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3326371149749546, "kl": 0.6640625, "learning_rate": 1.998984293049305e-05, "loss": 0.0531, "num_tokens": 37537731.0, "reward": 0.99609375, "reward_std": 0.24727347493171692, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1023.0, "completions/max_terminated_length": 1023.0, "completions/mean_length": 387.16796875, "completions/mean_terminated_length": 387.16796875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.11385533330072871, "frac_reward_zero_std": 0.375, "grad_norm": 0.2960210886965262, "kl": 0.70556640625, "learning_rate": 1.998921813587157e-05, "loss": 0.0091, "num_tokens": 37666926.0, "reward": 0.780029296875, "reward_std": 0.22023949027061462, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1845.0, "completions/mean_length": 412.0703125, "completions/mean_terminated_length": 399.18896484375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.11424658874162469, "frac_reward_zero_std": 0.25, "grad_norm": 0.3245364886686712, "kl": 0.66357421875, "learning_rate": 1.9988574704271652e-05, "loss": 0.0531, "num_tokens": 37801296.0, "reward": 0.9813232421875, "reward_std": 0.28665536642074585, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1197.0, "completions/mean_length": 389.19140625, "completions/mean_terminated_length": 382.6863098144531, "completions/min_length": 152.0, "completions/min_terminated_length": 152.0, "epoch": 0.11463784418252067, "frac_reward_zero_std": 0.25, "grad_norm": 0.35802445863691257, "kl": 0.69384765625, "learning_rate": 1.9987912636893744e-05, "loss": 0.0579, "num_tokens": 37930113.0, "reward": 1.0262451171875, "reward_std": 0.30891746282577515, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1508.0, "completions/max_terminated_length": 1508.0, "completions/mean_length": 392.5234375, "completions/mean_terminated_length": 392.5234375, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.11502909962341663, "frac_reward_zero_std": 0.3125, "grad_norm": 0.36045826836035694, "kl": 0.79736328125, "learning_rate": 1.9987231934973082e-05, "loss": 0.036, "num_tokens": 38058583.0, "reward": 0.893310546875, "reward_std": 0.25631842017173767, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 981.0, "completions/mean_length": 346.5078125, "completions/mean_terminated_length": 339.8353271484375, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.11542035506431261, "frac_reward_zero_std": 0.5, "grad_norm": 0.4120501735095288, "kl": 0.62451171875, "learning_rate": 1.9986532599779653e-05, "loss": 0.0593, "num_tokens": 38176713.0, "reward": 1.0501708984375, "reward_std": 0.18089252710342407, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1062.0, "completions/max_terminated_length": 1062.0, "completions/mean_length": 333.9375, "completions/mean_terminated_length": 333.9375, "completions/min_length": 100.0, "completions/min_terminated_length": 100.0, "epoch": 0.11581161050520859, "frac_reward_zero_std": 0.375, "grad_norm": 0.3355471294666379, "kl": 0.740234375, "learning_rate": 1.998581463261821e-05, "loss": 0.0008, "num_tokens": 38291609.0, "reward": 0.936279296875, "reward_std": 0.26775258779525757, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 979.0, "completions/max_terminated_length": 979.0, "completions/mean_length": 364.32421875, "completions/mean_terminated_length": 364.32421875, "completions/min_length": 146.0, "completions/min_terminated_length": 146.0, "epoch": 0.11620286594610456, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3162658193434941, "kl": 0.61474609375, "learning_rate": 1.998507803482828e-05, "loss": -0.0018, "num_tokens": 38414300.0, "reward": 0.9578857421875, "reward_std": 0.201925590634346, "rewards/accuracy_reward/mean": 0.7109375, "rewards/accuracy_reward/std": 0.45421501994132996, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 842.0, "completions/max_terminated_length": 842.0, "completions/mean_length": 376.88671875, "completions/mean_terminated_length": 376.88671875, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.11659412138700054, "frac_reward_zero_std": 0.1875, "grad_norm": 0.388369966669549, "kl": 0.58251953125, "learning_rate": 1.9984322807784132e-05, "loss": 0.0539, "num_tokens": 38538591.0, "reward": 0.994873046875, "reward_std": 0.2887622117996216, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 736.0, "completions/max_terminated_length": 736.0, "completions/mean_length": 387.2109375, "completions/mean_terminated_length": 387.2109375, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.11698537682789652, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3320285981068708, "kl": 0.62939453125, "learning_rate": 1.998354895289481e-05, "loss": 0.0105, "num_tokens": 38668693.0, "reward": 0.928466796875, "reward_std": 0.3031202256679535, "rewards/accuracy_reward/mean": 0.6796875, "rewards/accuracy_reward/std": 0.4675106406211853, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 804.0, "completions/max_terminated_length": 804.0, "completions/mean_length": 454.984375, "completions/mean_terminated_length": 454.984375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.11737663226879248, "frac_reward_zero_std": 0.125, "grad_norm": 0.3461765667057011, "kl": 0.5908203125, "learning_rate": 1.998275647160409e-05, "loss": 0.0239, "num_tokens": 38816689.0, "reward": 0.80712890625, "reward_std": 0.3424859642982483, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49705013632774353, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 872.0, "completions/max_terminated_length": 872.0, "completions/mean_length": 348.4921875, "completions/mean_terminated_length": 348.4921875, "completions/min_length": 107.0, "completions/min_terminated_length": 107.0, "epoch": 0.11776788770968846, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4561222859452704, "kl": 0.59130859375, "learning_rate": 1.9981945365390517e-05, "loss": 0.0426, "num_tokens": 38939599.0, "reward": 1.002685546875, "reward_std": 0.2723240852355957, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 901.0, "completions/max_terminated_length": 901.0, "completions/mean_length": 355.765625, "completions/mean_terminated_length": 355.765625, "completions/min_length": 80.0, "completions/min_terminated_length": 80.0, "epoch": 0.11815914315058444, "frac_reward_zero_std": 0.5, "grad_norm": 0.3040663019863079, "kl": 0.56396484375, "learning_rate": 1.998111563576738e-05, "loss": 0.0169, "num_tokens": 39059603.0, "reward": 1.030029296875, "reward_std": 0.22204959392547607, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 355.86328125, "completions/mean_terminated_length": 355.86328125, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.11855039859148041, "frac_reward_zero_std": 0.25, "grad_norm": 0.3430766289733513, "kl": 0.514404296875, "learning_rate": 1.9980267284282718e-05, "loss": -0.003, "num_tokens": 39179360.0, "reward": 1.06396484375, "reward_std": 0.25875645875930786, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 422.0703125, "completions/mean_terminated_length": 422.0703125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.11894165403237639, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4558278323300668, "kl": 0.576171875, "learning_rate": 1.9979400312519298e-05, "loss": 0.0259, "num_tokens": 39316306.0, "reward": 0.9713134765625, "reward_std": 0.23361653089523315, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 822.0, "completions/max_terminated_length": 822.0, "completions/mean_length": 331.65625, "completions/mean_terminated_length": 331.65625, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.11933290947327237, "frac_reward_zero_std": 0.3125, "grad_norm": 0.36726987882006795, "kl": 0.59814453125, "learning_rate": 1.997851472209465e-05, "loss": 0.0337, "num_tokens": 39429162.0, "reward": 1.059326171875, "reward_std": 0.229018896818161, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 696.0, "completions/max_terminated_length": 696.0, "completions/mean_length": 309.66015625, "completions/mean_terminated_length": 309.66015625, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.11972416491416833, "frac_reward_zero_std": 0.125, "grad_norm": 0.4095092776241663, "kl": 0.56787109375, "learning_rate": 1.9977610514661018e-05, "loss": 0.0233, "num_tokens": 39536691.0, "reward": 1.0772705078125, "reward_std": 0.17300698161125183, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.9853515625, "rewards/tag_count_reward/std": 0.0628589615225792, "step": 306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 688.0, "completions/max_terminated_length": 688.0, "completions/mean_length": 315.77734375, "completions/mean_terminated_length": 315.77734375, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.12011542035506431, "frac_reward_zero_std": 0.1875, "grad_norm": 0.4491825453766641, "kl": 0.5732421875, "learning_rate": 1.9976687691905394e-05, "loss": 0.0166, "num_tokens": 39646378.0, "reward": 1.1273193359375, "reward_std": 0.22940635681152344, "rewards/accuracy_reward/mean": 0.8828125, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 853.0, "completions/max_terminated_length": 853.0, "completions/mean_length": 337.578125, "completions/mean_terminated_length": 337.578125, "completions/min_length": 88.0, "completions/min_terminated_length": 88.0, "epoch": 0.12050667579596029, "frac_reward_zero_std": 0.25, "grad_norm": 0.3655560757384734, "kl": 0.54638671875, "learning_rate": 1.9975746255549496e-05, "loss": 0.0219, "num_tokens": 39762862.0, "reward": 0.9801025390625, "reward_std": 0.25896766781806946, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 864.0, "completions/max_terminated_length": 864.0, "completions/mean_length": 406.796875, "completions/mean_terminated_length": 406.796875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.12089793123685626, "frac_reward_zero_std": 0.125, "grad_norm": 0.34552876606198085, "kl": 0.494873046875, "learning_rate": 1.9974786207349775e-05, "loss": 0.0235, "num_tokens": 39895642.0, "reward": 1.0152587890625, "reward_std": 0.26819008588790894, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 885.0, "completions/max_terminated_length": 885.0, "completions/mean_length": 307.03125, "completions/mean_terminated_length": 307.03125, "completions/min_length": 82.0, "completions/min_terminated_length": 82.0, "epoch": 0.12128918667775224, "frac_reward_zero_std": 0.3125, "grad_norm": 0.401772383486407, "kl": 0.55908203125, "learning_rate": 1.9973807549097396e-05, "loss": 0.0331, "num_tokens": 40003762.0, "reward": 0.9617919921875, "reward_std": 0.19513608515262604, "rewards/accuracy_reward/mean": 0.71484375, "rewards/accuracy_reward/std": 0.4523732364177704, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 326.44140625, "completions/mean_terminated_length": 326.44140625, "completions/min_length": 55.0, "completions/min_terminated_length": 55.0, "epoch": 0.12168044211864822, "frac_reward_zero_std": 0.25, "grad_norm": 0.5732324779280024, "kl": 0.579833984375, "learning_rate": 1.9972810282618256e-05, "loss": 0.0315, "num_tokens": 40117939.0, "reward": 0.9903564453125, "reward_std": 0.31498581171035767, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 953.0, "completions/max_terminated_length": 953.0, "completions/mean_length": 306.23046875, "completions/mean_terminated_length": 306.23046875, "completions/min_length": 116.0, "completions/min_terminated_length": 116.0, "epoch": 0.12207169755954418, "frac_reward_zero_std": 0.375, "grad_norm": 0.33345820933132553, "kl": 0.478271484375, "learning_rate": 1.9971794409772962e-05, "loss": -0.005, "num_tokens": 40225806.0, "reward": 1.0572509765625, "reward_std": 0.17780093848705292, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 361.12890625, "completions/mean_terminated_length": 361.12890625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.12246295300044016, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3559757628585404, "kl": 0.54833984375, "learning_rate": 1.9970759932456836e-05, "loss": 0.009, "num_tokens": 40348095.0, "reward": 0.9296875, "reward_std": 0.22930431365966797, "rewards/accuracy_reward/mean": 0.6796875, "rewards/accuracy_reward/std": 0.4675106406211853, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1116.0, "completions/max_terminated_length": 1116.0, "completions/mean_length": 425.66015625, "completions/mean_terminated_length": 425.66015625, "completions/min_length": 86.0, "completions/min_terminated_length": 86.0, "epoch": 0.12285420844133614, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4897926697141651, "kl": 0.6005859375, "learning_rate": 1.9969706852599915e-05, "loss": 0.0215, "num_tokens": 40486712.0, "reward": 0.89208984375, "reward_std": 0.235600084066391, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1078.0, "completions/max_terminated_length": 1078.0, "completions/mean_length": 402.14453125, "completions/mean_terminated_length": 402.14453125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1232454638822321, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3735306828608242, "kl": 0.55712890625, "learning_rate": 1.996863517216694e-05, "loss": 0.0225, "num_tokens": 40618621.0, "reward": 0.8255615234375, "reward_std": 0.29772743582725525, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.041130900382995605, "step": 315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1083.0, "completions/max_terminated_length": 1083.0, "completions/mean_length": 413.73828125, "completions/mean_terminated_length": 413.73828125, "completions/min_length": 106.0, "completions/min_terminated_length": 106.0, "epoch": 0.12363671932312809, "frac_reward_zero_std": 0.375, "grad_norm": 0.4392916017734781, "kl": 0.7041015625, "learning_rate": 1.9967544893157352e-05, "loss": 0.0286, "num_tokens": 40754522.0, "reward": 0.936279296875, "reward_std": 0.2340478152036667, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1361.0, "completions/max_terminated_length": 1361.0, "completions/mean_length": 484.515625, "completions/mean_terminated_length": 484.515625, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.12402797476402407, "frac_reward_zero_std": 0.4375, "grad_norm": 0.29320437661932813, "kl": 0.62890625, "learning_rate": 1.9966436017605296e-05, "loss": 0.0593, "num_tokens": 40910046.0, "reward": 0.7532958984375, "reward_std": 0.21721693873405457, "rewards/accuracy_reward/mean": 0.50390625, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 905.0, "completions/max_terminated_length": 905.0, "completions/mean_length": 389.40625, "completions/mean_terminated_length": 389.40625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.12441923020492003, "frac_reward_zero_std": 0.3125, "grad_norm": 0.41785888864247905, "kl": 0.7109375, "learning_rate": 1.9965308547579613e-05, "loss": 0.0204, "num_tokens": 41039526.0, "reward": 0.9356689453125, "reward_std": 0.30525484681129456, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1009.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 434.4609375, "completions/mean_terminated_length": 434.4609375, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.12481048564581601, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3464417343924594, "kl": 0.669921875, "learning_rate": 1.9964162485183837e-05, "loss": 0.0338, "num_tokens": 41180492.0, "reward": 0.8509521484375, "reward_std": 0.29325175285339355, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1149.0, "completions/max_terminated_length": 1149.0, "completions/mean_length": 427.6796875, "completions/mean_terminated_length": 427.6796875, "completions/min_length": 151.0, "completions/min_terminated_length": 151.0, "epoch": 0.12520174108671198, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3664292140127626, "kl": 0.6630859375, "learning_rate": 1.9962997832556182e-05, "loss": 0.0296, "num_tokens": 41321482.0, "reward": 0.9329833984375, "reward_std": 0.26370853185653687, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 851.0, "completions/max_terminated_length": 851.0, "completions/mean_length": 424.625, "completions/mean_terminated_length": 424.625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.12559299652760797, "frac_reward_zero_std": 0.25, "grad_norm": 0.3735732336235536, "kl": 0.70361328125, "learning_rate": 1.9961814591869558e-05, "loss": 0.0339, "num_tokens": 41460090.0, "reward": 0.8262939453125, "reward_std": 0.264379620552063, "rewards/accuracy_reward/mean": 0.578125, "rewards/accuracy_reward/std": 0.49482619762420654, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 358.3828125, "completions/mean_terminated_length": 358.3828125, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.12598425196850394, "frac_reward_zero_std": 0.3125, "grad_norm": 0.4475507695469866, "kl": 0.74365234375, "learning_rate": 1.996061276533154e-05, "loss": 0.033, "num_tokens": 41581916.0, "reward": 0.866943359375, "reward_std": 0.28162315487861633, "rewards/accuracy_reward/mean": 0.6171875, "rewards/accuracy_reward/std": 0.48702529072761536, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 768.0, "completions/max_terminated_length": 768.0, "completions/mean_length": 357.4375, "completions/mean_terminated_length": 357.4375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.1263755074093999, "frac_reward_zero_std": 0.25, "grad_norm": 0.4792842784275836, "kl": 0.765625, "learning_rate": 1.995939235518439e-05, "loss": 0.0055, "num_tokens": 41701932.0, "reward": 0.72412109375, "reward_std": 0.25478848814964294, "rewards/accuracy_reward/mean": 0.4765625, "rewards/accuracy_reward/std": 0.5004287362098694, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 742.0, "completions/max_terminated_length": 742.0, "completions/mean_length": 319.30078125, "completions/mean_terminated_length": 319.30078125, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.1267667628502959, "frac_reward_zero_std": 0.1875, "grad_norm": 1.0127641221024817, "kl": 0.77587890625, "learning_rate": 1.9958153363705044e-05, "loss": 0.0026, "num_tokens": 41813369.0, "reward": 0.81982421875, "reward_std": 0.22450149059295654, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.048884619027376175, "step": 324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 697.0, "completions/max_terminated_length": 697.0, "completions/mean_length": 323.5625, "completions/mean_terminated_length": 323.5625, "completions/min_length": 119.0, "completions/min_terminated_length": 119.0, "epoch": 0.12715801829119186, "frac_reward_zero_std": 0.375, "grad_norm": 0.37160230028853747, "kl": 0.7900390625, "learning_rate": 1.9956895793205093e-05, "loss": 0.0325, "num_tokens": 41925177.0, "reward": 1.0059814453125, "reward_std": 0.27687737345695496, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 685.0, "completions/max_terminated_length": 685.0, "completions/mean_length": 324.73046875, "completions/mean_terminated_length": 324.73046875, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.12754927373208783, "frac_reward_zero_std": 0.25, "grad_norm": 0.38444170244344067, "kl": 0.7568359375, "learning_rate": 1.99556196460308e-05, "loss": 0.0169, "num_tokens": 42037716.0, "reward": 0.81640625, "reward_std": 0.33270958065986633, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1111.0, "completions/max_terminated_length": 1111.0, "completions/mean_length": 310.7578125, "completions/mean_terminated_length": 310.7578125, "completions/min_length": 24.0, "completions/min_terminated_length": 24.0, "epoch": 0.12794052917298382, "frac_reward_zero_std": 0.25, "grad_norm": 0.4552403909567317, "kl": 0.76953125, "learning_rate": 1.9954324924563088e-05, "loss": 0.0013, "num_tokens": 42146150.0, "reward": 0.95166015625, "reward_std": 0.26907670497894287, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45777595043182373, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.04935242608189583, "step": 327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 786.0, "completions/max_terminated_length": 786.0, "completions/mean_length": 288.59765625, "completions/mean_terminated_length": 288.59765625, "completions/min_length": 112.0, "completions/min_terminated_length": 112.0, "epoch": 0.12833178461387978, "frac_reward_zero_std": 0.3125, "grad_norm": 0.39975470360412674, "kl": 0.78955078125, "learning_rate": 1.995301163121753e-05, "loss": 0.0201, "num_tokens": 42248591.0, "reward": 0.955810546875, "reward_std": 0.25368595123291016, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 377.83203125, "completions/mean_terminated_length": 377.83203125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.12872304005477575, "frac_reward_zero_std": 0.1875, "grad_norm": 0.40348665990855315, "kl": 0.71044921875, "learning_rate": 1.9951679768444346e-05, "loss": 0.0223, "num_tokens": 42374756.0, "reward": 0.9383544921875, "reward_std": 0.28252720832824707, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 750.0, "completions/max_terminated_length": 750.0, "completions/mean_length": 309.01953125, "completions/mean_terminated_length": 309.01953125, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.12911429549567174, "frac_reward_zero_std": 0.1875, "grad_norm": 0.42228906374027236, "kl": 0.74755859375, "learning_rate": 1.995032933872841e-05, "loss": 0.0303, "num_tokens": 42483353.0, "reward": 0.921875, "reward_std": 0.3319331109523773, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 342.8359375, "completions/mean_terminated_length": 342.8359375, "completions/min_length": 104.0, "completions/min_terminated_length": 104.0, "epoch": 0.1295055509365677, "frac_reward_zero_std": 0.125, "grad_norm": 0.3910974643066715, "kl": 0.716796875, "learning_rate": 1.994896034458923e-05, "loss": -0.0023, "num_tokens": 42599119.0, "reward": 0.9375, "reward_std": 0.34223634004592896, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 987.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 387.53125, "completions/mean_terminated_length": 387.53125, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.12989680637746367, "frac_reward_zero_std": 0.375, "grad_norm": 0.3489565280419393, "kl": 0.71630859375, "learning_rate": 1.994757278858095e-05, "loss": 0.0429, "num_tokens": 42728231.0, "reward": 0.951904296875, "reward_std": 0.2644009590148926, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45777595043182373, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 397.11328125, "completions/mean_terminated_length": 397.11328125, "completions/min_length": 90.0, "completions/min_terminated_length": 90.0, "epoch": 0.13028806181835967, "frac_reward_zero_std": 0.3125, "grad_norm": 1.2050549516007454, "kl": 0.6845703125, "learning_rate": 1.9946166673292344e-05, "loss": 0.0375, "num_tokens": 42858756.0, "reward": 0.9251708984375, "reward_std": 0.2680123448371887, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1051.0, "completions/max_terminated_length": 1051.0, "completions/mean_length": 470.90625, "completions/mean_terminated_length": 470.90625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.13067931725925563, "frac_reward_zero_std": 0.1875, "grad_norm": 0.33169503010950935, "kl": 0.642578125, "learning_rate": 1.994474200134682e-05, "loss": 0.0411, "num_tokens": 43010284.0, "reward": 0.88427734375, "reward_std": 0.3257395327091217, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 402.484375, "completions/mean_terminated_length": 402.484375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.1310705727001516, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3369991028213982, "kl": 0.7265625, "learning_rate": 1.99432987754024e-05, "loss": 0.0224, "num_tokens": 43141608.0, "reward": 1.0399169921875, "reward_std": 0.19667404890060425, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1925.0, "completions/mean_length": 615.83203125, "completions/mean_terminated_length": 610.2156982421875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.1314618281410476, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2798013697526391, "kl": 0.61083984375, "learning_rate": 1.9941836998151724e-05, "loss": 0.0281, "num_tokens": 43331629.0, "reward": 0.7677001953125, "reward_std": 0.2521253526210785, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1103.0, "completions/max_terminated_length": 1103.0, "completions/mean_length": 475.71484375, "completions/mean_terminated_length": 475.71484375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.13185308358194356, "frac_reward_zero_std": 0.5, "grad_norm": 0.3011334446099935, "kl": 0.6240234375, "learning_rate": 1.9940356672322037e-05, "loss": -0.0064, "num_tokens": 43483188.0, "reward": 0.93359375, "reward_std": 0.20388564467430115, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 550.4140625, "completions/mean_terminated_length": 544.5411987304688, "completions/min_length": 198.0, "completions/min_terminated_length": 198.0, "epoch": 0.13224433902283952, "frac_reward_zero_std": 0.375, "grad_norm": 0.2926024031461953, "kl": 0.662109375, "learning_rate": 1.9938857800675206e-05, "loss": 0.0017, "num_tokens": 43654654.0, "reward": 1.0306396484375, "reward_std": 0.26226598024368286, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1184.0, "completions/max_terminated_length": 1184.0, "completions/mean_length": 546.3046875, "completions/mean_terminated_length": 546.3046875, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.13263559446373552, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3213521106511354, "kl": 0.58837890625, "learning_rate": 1.993734038600769e-05, "loss": 0.0055, "num_tokens": 43823724.0, "reward": 1.0, "reward_std": 0.3374027907848358, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1848.0, "completions/mean_length": 735.99609375, "completions/mean_terminated_length": 720.4387817382812, "completions/min_length": 288.0, "completions/min_terminated_length": 288.0, "epoch": 0.13302684990463148, "frac_reward_zero_std": 0.0625, "grad_norm": 0.31076801573692314, "kl": 0.5712890625, "learning_rate": 1.993580443115054e-05, "loss": 0.0707, "num_tokens": 44040187.0, "reward": 0.7943115234375, "reward_std": 0.28976693749427795, "rewards/accuracy_reward/mean": 0.546875, "rewards/accuracy_reward/std": 0.4987730085849762, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.041130900382995605, "step": 340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1862.0, "completions/mean_length": 828.84765625, "completions/mean_terminated_length": 789.5201416015625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.13341810534552745, "frac_reward_zero_std": 0.125, "grad_norm": 0.3230994459625229, "kl": 0.56884765625, "learning_rate": 1.9934249938969396e-05, "loss": 0.0833, "num_tokens": 44281828.0, "reward": 0.712158203125, "reward_std": 0.2963637709617615, "rewards/accuracy_reward/mean": 0.46875, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.048530805855989456, "step": 341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1824.0, "completions/mean_length": 665.390625, "completions/mean_terminated_length": 654.50390625, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.13380936078642344, "frac_reward_zero_std": 0.1875, "grad_norm": 0.30111171200640224, "kl": 0.60888671875, "learning_rate": 1.99326769123645e-05, "loss": 0.0506, "num_tokens": 44481272.0, "reward": 0.783935546875, "reward_std": 0.33233416080474854, "rewards/accuracy_reward/mean": 0.53515625, "rewards/accuracy_reward/std": 0.49973952770233154, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1576.0, "completions/max_terminated_length": 1576.0, "completions/mean_length": 581.9375, "completions/mean_terminated_length": 581.9375, "completions/min_length": 240.0, "completions/min_terminated_length": 240.0, "epoch": 0.1342006162273194, "frac_reward_zero_std": 0.4375, "grad_norm": 0.28759601555903186, "kl": 0.61767578125, "learning_rate": 1.9931085354270658e-05, "loss": 0.0073, "num_tokens": 44660888.0, "reward": 0.94921875, "reward_std": 0.2184043675661087, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9609375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1870.0, "completions/mean_length": 814.68359375, "completions/mean_terminated_length": 790.1155395507812, "completions/min_length": 250.0, "completions/min_terminated_length": 250.0, "epoch": 0.13459187166821537, "frac_reward_zero_std": 0.1875, "grad_norm": 0.2803721522637744, "kl": 0.60791015625, "learning_rate": 1.9929475267657256e-05, "loss": 0.0325, "num_tokens": 44899319.0, "reward": 0.70263671875, "reward_std": 0.3624272644519806, "rewards/accuracy_reward/mean": 0.45703125, "rewards/accuracy_reward/std": 0.4991260766983032, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.048884619027376175, "step": 344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2019.0, "completions/mean_length": 798.18359375, "completions/mean_terminated_length": 747.3779907226562, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.13498312710911137, "frac_reward_zero_std": 0.25, "grad_norm": 0.26953547892632773, "kl": 0.609375, "learning_rate": 1.992784665552824e-05, "loss": 0.0417, "num_tokens": 45133158.0, "reward": 0.9425048828125, "reward_std": 0.3312150239944458, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.05079597979784012, "step": 345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2029.0, "completions/mean_length": 825.05859375, "completions/mean_terminated_length": 805.6468505859375, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.13537438255000733, "frac_reward_zero_std": 0.1875, "grad_norm": 0.28278246723461475, "kl": 0.587890625, "learning_rate": 1.9926199520922135e-05, "loss": 0.0483, "num_tokens": 45373477.0, "reward": 0.94287109375, "reward_std": 0.3074232339859009, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2016.0, "completions/mean_length": 966.53515625, "completions/mean_terminated_length": 903.9710083007812, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1357656379909033, "frac_reward_zero_std": 0.3125, "grad_norm": 0.26584958132402015, "kl": 0.53759765625, "learning_rate": 1.9924533866912017e-05, "loss": 0.0167, "num_tokens": 45650574.0, "reward": 0.757080078125, "reward_std": 0.2850263714790344, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5007347464561462, "rewards/format_reward/mean": 0.9453125, "rewards/format_reward/std": 0.22781464457511902, "rewards/tag_count_reward/mean": 0.986328125, "rewards/tag_count_reward/std": 0.056953661143779755, "step": 347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.90625, "completions/max_length": 2048.0, "completions/max_terminated_length": 2012.0, "completions/mean_length": 930.2578125, "completions/mean_terminated_length": 875.286865234375, "completions/min_length": 306.0, "completions/min_terminated_length": 306.0, "epoch": 0.1361568934317993, "frac_reward_zero_std": 0.0, "grad_norm": 0.32516286356393964, "kl": 0.56591796875, "learning_rate": 1.992284969660551e-05, "loss": 0.0847, "num_tokens": 45917792.0, "reward": 0.8045654296875, "reward_std": 0.3512764871120453, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49705013632774353, "rewards/format_reward/mean": 0.94921875, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.05499519780278206, "step": 348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.859375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2030.0, "completions/mean_length": 870.28125, "completions/mean_terminated_length": 781.2101440429688, "completions/min_length": 243.0, "completions/min_terminated_length": 243.0, "epoch": 0.13654814887269526, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3845231313651499, "kl": 0.5966796875, "learning_rate": 1.9921147013144782e-05, "loss": 0.1192, "num_tokens": 46170904.0, "reward": 0.887451171875, "reward_std": 0.3510875701904297, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 0.90625, "rewards/format_reward/std": 0.2920515835285187, "rewards/tag_count_reward/mean": 0.974609375, "rewards/tag_count_reward/std": 0.08482880145311356, "step": 349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 801.09765625, "completions/mean_terminated_length": 781.3056030273438, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.13693940431359122, "frac_reward_zero_std": 0.375, "grad_norm": 0.26960491803676984, "kl": 0.59130859375, "learning_rate": 1.991942581970655e-05, "loss": 0.0209, "num_tokens": 46404961.0, "reward": 0.894775390625, "reward_std": 0.2029879093170166, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.07606977969408035, "step": 350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.3515625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1989.0, "completions/mean_length": 1483.5390625, "completions/mean_terminated_length": 1212.728271484375, "completions/min_length": 227.0, "completions/min_terminated_length": 227.0, "epoch": 0.13733065975448722, "frac_reward_zero_std": 0.0, "grad_norm": 0.2744962918268671, "kl": 0.5546875, "learning_rate": 1.9917686119502057e-05, "loss": 0.1533, "num_tokens": 46813691.0, "reward": 0.4766845703125, "reward_std": 0.38591501116752625, "rewards/accuracy_reward/mean": 0.28125, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.66015625, "rewards/format_reward/std": 0.47458380460739136, "rewards/tag_count_reward/mean": 0.9033203125, "rewards/tag_count_reward/std": 0.15074557065963745, "step": 351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2043.0, "completions/mean_length": 977.4375, "completions/mean_terminated_length": 942.9031982421875, "completions/min_length": 271.0, "completions/min_terminated_length": 271.0, "epoch": 0.13772191519538318, "frac_reward_zero_std": 0.0, "grad_norm": 0.31368994356265045, "kl": 0.60546875, "learning_rate": 1.9915927915777085e-05, "loss": 0.0629, "num_tokens": 47093547.0, "reward": 0.74267578125, "reward_std": 0.42928528785705566, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5009794235229492, "rewards/format_reward/mean": 0.953125, "rewards/format_reward/std": 0.21178513765335083, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.05294628441333771, "step": 352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1613.0, "completions/max_terminated_length": 1613.0, "completions/mean_length": 668.47265625, "completions/mean_terminated_length": 668.47265625, "completions/min_length": 219.0, "completions/min_terminated_length": 219.0, "epoch": 0.13811317063627915, "frac_reward_zero_std": 0.1875, "grad_norm": 0.2698012223219043, "kl": 0.64404296875, "learning_rate": 1.9914151211811924e-05, "loss": 0.0001, "num_tokens": 47294788.0, "reward": 0.7716064453125, "reward_std": 0.2971839904785156, "rewards/accuracy_reward/mean": 0.5234375, "rewards/accuracy_reward/std": 0.5004287362098694, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1384.0, "completions/max_terminated_length": 1384.0, "completions/mean_length": 530.359375, "completions/mean_terminated_length": 530.359375, "completions/min_length": 149.0, "completions/min_terminated_length": 149.0, "epoch": 0.13850442607717514, "frac_reward_zero_std": 0.125, "grad_norm": 0.5826276596609472, "kl": 0.71044921875, "learning_rate": 1.9912356010921394e-05, "loss": 0.0375, "num_tokens": 47460720.0, "reward": 0.8150634765625, "reward_std": 0.3488168716430664, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1651.0, "completions/max_terminated_length": 1651.0, "completions/mean_length": 521.00390625, "completions/mean_terminated_length": 521.00390625, "completions/min_length": 277.0, "completions/min_terminated_length": 277.0, "epoch": 0.1388956815180711, "frac_reward_zero_std": 0.0, "grad_norm": 0.39500008509886764, "kl": 0.7548828125, "learning_rate": 1.9910542316454813e-05, "loss": 0.0324, "num_tokens": 47625473.0, "reward": 0.7347412109375, "reward_std": 0.29967302083969116, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.05078125, "rewards/format_reward/std": 0.21998079121112823, "rewards/tag_count_reward/mean": 0.2333984375, "rewards/tag_count_reward/std": 0.24347923696041107, "step": 355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1459.0, "completions/max_terminated_length": 1459.0, "completions/mean_length": 529.49609375, "completions/mean_terminated_length": 529.49609375, "completions/min_length": 122.0, "completions/min_terminated_length": 122.0, "epoch": 0.13928693695896707, "frac_reward_zero_std": 0.0, "grad_norm": 0.38450223956024054, "kl": 0.67431640625, "learning_rate": 1.990871013179601e-05, "loss": 0.0102, "num_tokens": 47791328.0, "reward": 0.7034912109375, "reward_std": 0.2821570634841919, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.4677734375, "rewards/tag_count_reward/std": 0.18965016305446625, "step": 356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1465.0, "completions/max_terminated_length": 1465.0, "completions/mean_length": 569.87109375, "completions/mean_terminated_length": 569.87109375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.13967819239986307, "frac_reward_zero_std": 0.0, "grad_norm": 0.35287919775909776, "kl": 0.65087890625, "learning_rate": 1.9906859460363307e-05, "loss": 0.0241, "num_tokens": 47966831.0, "reward": 0.5726318359375, "reward_std": 0.35878786444664, "rewards/accuracy_reward/mean": 0.484375, "rewards/accuracy_reward/std": 0.5007347464561462, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7060546875, "rewards/tag_count_reward/std": 0.10743197798728943, "step": 357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 485.89453125, "completions/mean_terminated_length": 479.7686462402344, "completions/min_length": 195.0, "completions/min_terminated_length": 195.0, "epoch": 0.14006944784075903, "frac_reward_zero_std": 0.25, "grad_norm": 0.327064135219235, "kl": 0.70458984375, "learning_rate": 1.9904990305609524e-05, "loss": 0.0305, "num_tokens": 48120500.0, "reward": 0.705078125, "reward_std": 0.15363967418670654, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.060633908957242966, "step": 358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 994.0, "completions/max_terminated_length": 994.0, "completions/mean_length": 466.6796875, "completions/mean_terminated_length": 466.6796875, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.140460703281655, "frac_reward_zero_std": 0.25, "grad_norm": 0.3065116591656918, "kl": 0.71142578125, "learning_rate": 1.9903102671021955e-05, "loss": 0.0147, "num_tokens": 48267938.0, "reward": 0.7021484375, "reward_std": 0.25391536951065063, "rewards/accuracy_reward/mean": 0.609375, "rewards/accuracy_reward/std": 0.48884621262550354, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7421875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1290.0, "completions/max_terminated_length": 1290.0, "completions/mean_length": 496.8359375, "completions/mean_terminated_length": 496.8359375, "completions/min_length": 194.0, "completions/min_terminated_length": 194.0, "epoch": 0.140851958722551, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35399855895968696, "kl": 0.677734375, "learning_rate": 1.9901196560122384e-05, "loss": 0.0308, "num_tokens": 48425096.0, "reward": 0.6773681640625, "reward_std": 0.3519188165664673, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7314453125, "rewards/tag_count_reward/std": 0.08223269879817963, "step": 360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 497.03125, "completions/mean_terminated_length": 497.03125, "completions/min_length": 136.0, "completions/min_terminated_length": 136.0, "epoch": 0.14124321416344696, "frac_reward_zero_std": 0.0625, "grad_norm": 0.343646026489702, "kl": 0.64208984375, "learning_rate": 1.9899271976467058e-05, "loss": 0.0236, "num_tokens": 48581168.0, "reward": 0.6043701171875, "reward_std": 0.3659793436527252, "rewards/accuracy_reward/mean": 0.51171875, "rewards/accuracy_reward/std": 0.5008418560028076, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1757.0, "completions/max_terminated_length": 1757.0, "completions/mean_length": 489.375, "completions/mean_terminated_length": 489.375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.14163446960434292, "frac_reward_zero_std": 0.125, "grad_norm": 0.3446578195808061, "kl": 0.6767578125, "learning_rate": 1.989732892364668e-05, "loss": 0.0106, "num_tokens": 48741648.0, "reward": 0.7822265625, "reward_std": 0.2422993928194046, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7265625, "rewards/tag_count_reward/std": 0.07629599422216415, "step": 362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1422.0, "completions/max_terminated_length": 1422.0, "completions/mean_length": 558.87109375, "completions/mean_terminated_length": 558.87109375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.14202572504523892, "frac_reward_zero_std": 0.0, "grad_norm": 0.3497205607544578, "kl": 0.64599609375, "learning_rate": 1.989536740528644e-05, "loss": 0.0235, "num_tokens": 48915359.0, "reward": 0.6746826171875, "reward_std": 0.3668157160282135, "rewards/accuracy_reward/mean": 0.58203125, "rewards/accuracy_reward/std": 0.49419113993644714, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7412109375, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1326.0, "completions/max_terminated_length": 1326.0, "completions/mean_length": 531.62890625, "completions/mean_terminated_length": 531.62890625, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.14241698048613488, "frac_reward_zero_std": 0.0625, "grad_norm": 0.36565439899361185, "kl": 0.68115234375, "learning_rate": 1.989338742504595e-05, "loss": 0.0508, "num_tokens": 49080448.0, "reward": 0.782470703125, "reward_std": 0.23686936497688293, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.728515625, "rewards/tag_count_reward/std": 0.07361361384391785, "step": 364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1580.0, "completions/max_terminated_length": 1580.0, "completions/mean_length": 629.640625, "completions/mean_terminated_length": 629.640625, "completions/min_length": 321.0, "completions/min_terminated_length": 321.0, "epoch": 0.14280823592703085, "frac_reward_zero_std": 0.125, "grad_norm": 0.2976939751605174, "kl": 0.640625, "learning_rate": 1.989138898661928e-05, "loss": 0.0092, "num_tokens": 49270932.0, "reward": 0.8619384765625, "reward_std": 0.280696302652359, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7392578125, "rewards/tag_count_reward/std": 0.05079597979784012, "step": 365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1658.0, "completions/mean_length": 611.7734375, "completions/mean_terminated_length": 594.7431030273438, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.14319949136792684, "frac_reward_zero_std": 0.125, "grad_norm": 0.3605197857583257, "kl": 0.673828125, "learning_rate": 1.9889372093734932e-05, "loss": 0.0613, "num_tokens": 49456666.0, "reward": 0.9395751953125, "reward_std": 0.20850180089473724, "rewards/accuracy_reward/mean": 0.84765625, "rewards/accuracy_reward/std": 0.3600577116012573, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7353515625, "rewards/tag_count_reward/std": 0.06664416193962097, "step": 366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2044.0, "completions/mean_length": 746.7578125, "completions/mean_terminated_length": 704.7822265625, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.1435907468088228, "frac_reward_zero_std": 0.3125, "grad_norm": 0.29811063026537493, "kl": 0.642578125, "learning_rate": 1.988733675015585e-05, "loss": 0.0788, "num_tokens": 49677772.0, "reward": 0.8648681640625, "reward_std": 0.15611442923545837, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7314453125, "rewards/tag_count_reward/std": 0.0879921019077301, "step": 367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9375, "completions/max_length": 2048.0, "completions/max_terminated_length": 2046.0, "completions/mean_length": 839.91015625, "completions/mean_terminated_length": 800.9395141601562, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1439820022497188, "frac_reward_zero_std": 0.1875, "grad_norm": 0.415134481532119, "kl": 0.64501953125, "learning_rate": 1.988528295967939e-05, "loss": 0.0801, "num_tokens": 49920005.0, "reward": 0.7232666015625, "reward_std": 0.21470382809638977, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7236328125, "rewards/tag_count_reward/std": 0.11520691961050034, "step": 368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.984375, "completions/max_length": 2048.0, "completions/max_terminated_length": 1816.0, "completions/mean_length": 693.921875, "completions/mean_terminated_length": 683.2598266601562, "completions/min_length": 202.0, "completions/min_terminated_length": 202.0, "epoch": 0.14437325769061476, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2683548409196532, "kl": 0.60009765625, "learning_rate": 1.9883210726137324e-05, "loss": 0.0225, "num_tokens": 50129089.0, "reward": 0.8568115234375, "reward_std": 0.29898935556411743, "rewards/accuracy_reward/mean": 0.76953125, "rewards/accuracy_reward/std": 0.4219578504562378, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6982421875, "rewards/tag_count_reward/std": 0.17988786101341248, "step": 369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1920.0, "completions/mean_length": 722.44140625, "completions/mean_terminated_length": 706.7233276367188, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.14476451313151073, "frac_reward_zero_std": 0.0, "grad_norm": 0.2618534358988479, "kl": 0.54833984375, "learning_rate": 1.9881120053395842e-05, "loss": 0.008, "num_tokens": 50345538.0, "reward": 0.7410888671875, "reward_std": 0.33782893419265747, "rewards/accuracy_reward/mean": 0.66796875, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5849609375, "rewards/tag_count_reward/std": 0.29845747351646423, "step": 370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 1964.0, "completions/mean_length": 623.16015625, "completions/mean_terminated_length": 588.9640502929688, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.14515576857240672, "frac_reward_zero_std": 0.0, "grad_norm": 0.30867277879983857, "kl": 0.61328125, "learning_rate": 1.9879010945355534e-05, "loss": -0.0135, "num_tokens": 50534491.0, "reward": 0.7823486328125, "reward_std": 0.28811296820640564, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.5087890625, "rewards/tag_count_reward/std": 0.3301379680633545, "step": 371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1397.0, "completions/mean_length": 733.08984375, "completions/mean_terminated_length": 727.933349609375, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1455470240133027, "frac_reward_zero_std": 0.0, "grad_norm": 0.275833329224333, "kl": 0.55322265625, "learning_rate": 1.9876883405951378e-05, "loss": 0.0412, "num_tokens": 50750962.0, "reward": 0.7406005859375, "reward_std": 0.3617095351219177, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.6435546875, "rewards/tag_count_reward/std": 0.24683777987957, "step": 372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1770.0, "completions/mean_length": 667.40625, "completions/mean_terminated_length": 661.9921875, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.14593827945419865, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2749399871457409, "kl": 0.55224609375, "learning_rate": 1.987473743915275e-05, "loss": 0.0028, "num_tokens": 50950554.0, "reward": 0.719482421875, "reward_std": 0.31506484746932983, "rewards/accuracy_reward/mean": 0.6328125, "rewards/accuracy_reward/std": 0.48298248648643494, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.693359375, "rewards/tag_count_reward/std": 0.18580731749534607, "step": 373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1760.0, "completions/mean_length": 753.2109375, "completions/mean_terminated_length": 748.1333618164062, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.14632953489509465, "frac_reward_zero_std": 0.0, "grad_norm": 0.2794635076538427, "kl": 0.519775390625, "learning_rate": 1.987257304896339e-05, "loss": 0.0259, "num_tokens": 51173088.0, "reward": 0.77587890625, "reward_std": 0.2893019914627075, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.70703125, "rewards/tag_count_reward/std": 0.1584526151418686, "step": 374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1854.0, "completions/max_terminated_length": 1854.0, "completions/mean_length": 788.25390625, "completions/mean_terminated_length": 788.25390625, "completions/min_length": 303.0, "completions/min_terminated_length": 303.0, "epoch": 0.14672079033599061, "frac_reward_zero_std": 0.0625, "grad_norm": 0.252634432864106, "kl": 0.50146484375, "learning_rate": 1.9870390239421433e-05, "loss": 0.0228, "num_tokens": 51403169.0, "reward": 0.7203369140625, "reward_std": 0.31790053844451904, "rewards/accuracy_reward/mean": 0.62890625, "rewards/accuracy_reward/std": 0.48404383659362793, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7314453125, "rewards/tag_count_reward/std": 0.09598540514707565, "step": 375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1699.0, "completions/max_terminated_length": 1699.0, "completions/mean_length": 799.6796875, "completions/mean_terminated_length": 799.6796875, "completions/min_length": 278.0, "completions/min_terminated_length": 278.0, "epoch": 0.14711204577688658, "frac_reward_zero_std": 0.125, "grad_norm": 0.2573659444573144, "kl": 0.50390625, "learning_rate": 1.9868189014599362e-05, "loss": 0.0334, "num_tokens": 51638447.0, "reward": 0.7484130859375, "reward_std": 0.282761812210083, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47588926553726196, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.07723760604858398, "step": 376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1492.0, "completions/mean_length": 787.890625, "completions/mean_terminated_length": 782.9490356445312, "completions/min_length": 327.0, "completions/min_terminated_length": 327.0, "epoch": 0.14750330121778257, "frac_reward_zero_std": 0.1875, "grad_norm": 0.25167057218324207, "kl": 0.51171875, "learning_rate": 1.9865969378604023e-05, "loss": 0.035, "num_tokens": 51868707.0, "reward": 0.7991943359375, "reward_std": 0.18690307438373566, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7373046875, "rewards/tag_count_reward/std": 0.07723760604858398, "step": 377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 766.984375, "completions/mean_terminated_length": 766.984375, "completions/min_length": 350.0, "completions/min_terminated_length": 350.0, "epoch": 0.14789455665867854, "frac_reward_zero_std": 0.0, "grad_norm": 0.2669921290637966, "kl": 0.50927734375, "learning_rate": 1.986373133557661e-05, "loss": 0.0355, "num_tokens": 52094447.0, "reward": 0.841796875, "reward_std": 0.244765043258667, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.734375, "rewards/tag_count_reward/std": 0.07174300402402878, "step": 378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1644.0, "completions/max_terminated_length": 1644.0, "completions/mean_length": 867.5546875, "completions/mean_terminated_length": 867.5546875, "completions/min_length": 370.0, "completions/min_terminated_length": 370.0, "epoch": 0.1482858120995745, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2505172385305965, "kl": 0.53173828125, "learning_rate": 1.9861474889692666e-05, "loss": 0.0427, "num_tokens": 52346397.0, "reward": 0.790283203125, "reward_std": 0.29790568351745605, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.728515625, "rewards/tag_count_reward/std": 0.1108228787779808, "step": 379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1942.0, "completions/max_terminated_length": 1942.0, "completions/mean_length": 878.17578125, "completions/mean_terminated_length": 878.17578125, "completions/min_length": 273.0, "completions/min_terminated_length": 273.0, "epoch": 0.1486770675404705, "frac_reward_zero_std": 0.0, "grad_norm": 0.2694292607699712, "kl": 0.595703125, "learning_rate": 1.9859200045162056e-05, "loss": 0.0242, "num_tokens": 52599642.0, "reward": 0.79345703125, "reward_std": 0.3011976182460785, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.75390625, "rewards/tag_count_reward/std": 0.10138441622257233, "step": 380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1715.0, "completions/max_terminated_length": 1715.0, "completions/mean_length": 830.91015625, "completions/mean_terminated_length": 830.91015625, "completions/min_length": 296.0, "completions/min_terminated_length": 296.0, "epoch": 0.14906832298136646, "frac_reward_zero_std": 0.0, "grad_norm": 0.26582839039570977, "kl": 0.58935546875, "learning_rate": 1.9856906806228988e-05, "loss": 0.0311, "num_tokens": 52841235.0, "reward": 0.80029296875, "reward_std": 0.3624722361564636, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.74609375, "rewards/tag_count_reward/std": 0.15960851311683655, "step": 381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1557.0, "completions/max_terminated_length": 1557.0, "completions/mean_length": 826.47265625, "completions/mean_terminated_length": 826.47265625, "completions/min_length": 398.0, "completions/min_terminated_length": 398.0, "epoch": 0.14945957842226243, "frac_reward_zero_std": 0.0, "grad_norm": 0.2787031111198824, "kl": 0.5986328125, "learning_rate": 1.9854595177171968e-05, "loss": 0.0085, "num_tokens": 53082604.0, "reward": 0.910400390625, "reward_std": 0.2845161557197571, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.783203125, "rewards/tag_count_reward/std": 0.13780881464481354, "step": 382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1784.0, "completions/max_terminated_length": 1784.0, "completions/mean_length": 770.3671875, "completions/mean_terminated_length": 770.3671875, "completions/min_length": 256.0, "completions/min_terminated_length": 256.0, "epoch": 0.14985083386315842, "frac_reward_zero_std": 0.0, "grad_norm": 0.2820506237728571, "kl": 0.6025390625, "learning_rate": 1.985226516230384e-05, "loss": 0.0133, "num_tokens": 53308506.0, "reward": 0.901123046875, "reward_std": 0.24175775051116943, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.802734375, "rewards/tag_count_reward/std": 0.13880562782287598, "step": 383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1706.0, "completions/mean_length": 869.33203125, "completions/mean_terminated_length": 864.7098388671875, "completions/min_length": 420.0, "completions/min_terminated_length": 420.0, "epoch": 0.1502420893040544, "frac_reward_zero_std": 0.0, "grad_norm": 0.2608708293488967, "kl": 0.57177734375, "learning_rate": 1.9849916765971716e-05, "loss": -0.0043, "num_tokens": 53562127.0, "reward": 0.7406005859375, "reward_std": 0.2773701548576355, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.7998046875, "rewards/tag_count_reward/std": 0.20221799612045288, "step": 384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1638.0, "completions/max_terminated_length": 1638.0, "completions/mean_length": 732.578125, "completions/mean_terminated_length": 732.578125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.15063334474495035, "frac_reward_zero_std": 0.0, "grad_norm": 0.2815941807741102, "kl": 0.61083984375, "learning_rate": 1.984754999255704e-05, "loss": 0.0303, "num_tokens": 53778867.0, "reward": 0.7919921875, "reward_std": 0.3294128179550171, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8046875, "rewards/tag_count_reward/std": 0.2618514597415924, "step": 385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1636.0, "completions/max_terminated_length": 1636.0, "completions/mean_length": 697.6796875, "completions/mean_terminated_length": 697.6796875, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.15102460018584635, "frac_reward_zero_std": 0.0, "grad_norm": 0.2956233265660385, "kl": 0.625, "learning_rate": 1.9845164846475507e-05, "loss": 0.0389, "num_tokens": 53989153.0, "reward": 0.79150390625, "reward_std": 0.3030899167060852, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.80078125, "rewards/tag_count_reward/std": 0.27450379729270935, "step": 386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1131.0, "completions/max_terminated_length": 1131.0, "completions/mean_length": 566.26953125, "completions/mean_terminated_length": 566.26953125, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.1514158556267423, "frac_reward_zero_std": 0.0, "grad_norm": 0.30536807552127115, "kl": 0.62109375, "learning_rate": 1.9842761332177115e-05, "loss": -0.0006, "num_tokens": 54164246.0, "reward": 0.8192138671875, "reward_std": 0.35254448652267456, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8974609375, "rewards/tag_count_reward/std": 0.20166803896427155, "step": 387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1031.0, "completions/max_terminated_length": 1031.0, "completions/mean_length": 548.2109375, "completions/mean_terminated_length": 548.2109375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.15180711106763828, "frac_reward_zero_std": 0.0, "grad_norm": 0.31110499522741997, "kl": 0.58837890625, "learning_rate": 1.9840339454146124e-05, "loss": -0.0018, "num_tokens": 54333964.0, "reward": 0.8060302734375, "reward_std": 0.324663370847702, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8857421875, "rewards/tag_count_reward/std": 0.22444231808185577, "step": 388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 920.0, "completions/max_terminated_length": 920.0, "completions/mean_length": 500.8984375, "completions/mean_terminated_length": 500.8984375, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.15219836650853427, "frac_reward_zero_std": 0.0, "grad_norm": 0.2964438272490916, "kl": 0.6357421875, "learning_rate": 1.9837899216901054e-05, "loss": -0.0072, "num_tokens": 54493362.0, "reward": 0.8009033203125, "reward_std": 0.30982744693756104, "rewards/accuracy_reward/mean": 0.68359375, "rewards/accuracy_reward/std": 0.4659844934940338, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9384765625, "rewards/tag_count_reward/std": 0.19269509613513947, "step": 389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 971.0, "completions/max_terminated_length": 971.0, "completions/mean_length": 482.59375, "completions/mean_terminated_length": 482.59375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.15258962194943024, "frac_reward_zero_std": 0.0, "grad_norm": 0.3423104420613981, "kl": 0.6171875, "learning_rate": 1.9835440624994674e-05, "loss": -0.0036, "num_tokens": 54647306.0, "reward": 0.8809814453125, "reward_std": 0.2989996671676636, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9228515625, "rewards/tag_count_reward/std": 0.20923595130443573, "step": 390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 462.66796875, "completions/mean_terminated_length": 462.66796875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.1529808773903262, "frac_reward_zero_std": 0.0, "grad_norm": 0.35326688872106066, "kl": 0.63525390625, "learning_rate": 1.983296368301401e-05, "loss": -0.0078, "num_tokens": 54796117.0, "reward": 0.8597412109375, "reward_std": 0.2673071026802063, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9091796875, "rewards/tag_count_reward/std": 0.18204641342163086, "step": 391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 860.0, "completions/max_terminated_length": 860.0, "completions/mean_length": 472.41015625, "completions/mean_terminated_length": 472.41015625, "completions/min_length": 224.0, "completions/min_terminated_length": 224.0, "epoch": 0.1533721328312222, "frac_reward_zero_std": 0.0, "grad_norm": 0.3136358740669056, "kl": 0.60546875, "learning_rate": 1.9830468395580306e-05, "loss": 0.007, "num_tokens": 54946350.0, "reward": 0.7890625, "reward_std": 0.36408761143684387, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.90625, "rewards/tag_count_reward/std": 0.14348600804805756, "step": 392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 933.0, "completions/max_terminated_length": 933.0, "completions/mean_length": 535.796875, "completions/mean_terminated_length": 535.796875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.15376338827211816, "frac_reward_zero_std": 0.0, "grad_norm": 0.3062576371244963, "kl": 0.54052734375, "learning_rate": 1.982795476734905e-05, "loss": -0.0007, "num_tokens": 55113882.0, "reward": 0.7515869140625, "reward_std": 0.2820354402065277, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.8876953125, "rewards/tag_count_reward/std": 0.14462582767009735, "step": 393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1268.0, "completions/max_terminated_length": 1268.0, "completions/mean_length": 546.95703125, "completions/mean_terminated_length": 546.95703125, "completions/min_length": 232.0, "completions/min_terminated_length": 232.0, "epoch": 0.15415464371301413, "frac_reward_zero_std": 0.1875, "grad_norm": 0.27467180161384025, "kl": 0.5625, "learning_rate": 1.9825422803009943e-05, "loss": 0.008, "num_tokens": 55282575.0, "reward": 0.872802734375, "reward_std": 0.2723727226257324, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.982421875, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1019.0, "completions/max_terminated_length": 1019.0, "completions/mean_length": 531.109375, "completions/mean_terminated_length": 531.109375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.15454589915391012, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35245027585092664, "kl": 0.6591796875, "learning_rate": 1.982287250728689e-05, "loss": 0.0114, "num_tokens": 55447291.0, "reward": 0.7918701171875, "reward_std": 0.24008126556873322, "rewards/accuracy_reward/mean": 0.66796875, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9873046875, "rewards/tag_count_reward/std": 0.05499519780278206, "step": 395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1208.0, "completions/max_terminated_length": 1208.0, "completions/mean_length": 556.97265625, "completions/mean_terminated_length": 556.97265625, "completions/min_length": 249.0, "completions/min_terminated_length": 249.0, "epoch": 0.1549371545948061, "frac_reward_zero_std": 0.375, "grad_norm": 0.2384666091101589, "kl": 0.509765625, "learning_rate": 1.9820303884938002e-05, "loss": 0.025, "num_tokens": 55620612.0, "reward": 0.9244384765625, "reward_std": 0.24958039820194244, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.0743061825633049, "step": 396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 999.0, "completions/max_terminated_length": 999.0, "completions/mean_length": 523.27734375, "completions/mean_terminated_length": 523.27734375, "completions/min_length": 246.0, "completions/min_terminated_length": 246.0, "epoch": 0.15532841003570205, "frac_reward_zero_std": 0.125, "grad_norm": 0.3094377190616763, "kl": 0.52880859375, "learning_rate": 1.9817716940755586e-05, "loss": 0.0148, "num_tokens": 55785819.0, "reward": 0.90576171875, "reward_std": 0.30741822719573975, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 585.328125, "completions/mean_terminated_length": 585.328125, "completions/min_length": 264.0, "completions/min_terminated_length": 264.0, "epoch": 0.15571966547659805, "frac_reward_zero_std": 0.25, "grad_norm": 0.24668498834270064, "kl": 0.512451171875, "learning_rate": 1.9815111679566127e-05, "loss": 0.0033, "num_tokens": 55965647.0, "reward": 0.9178466796875, "reward_std": 0.19477546215057373, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.0078125, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1353.0, "completions/max_terminated_length": 1353.0, "completions/mean_length": 551.6796875, "completions/mean_terminated_length": 551.6796875, "completions/min_length": 234.0, "completions/min_terminated_length": 234.0, "epoch": 0.156110920917494, "frac_reward_zero_std": 0.0625, "grad_norm": 0.2861325685057427, "kl": 0.564453125, "learning_rate": 1.9812488106230286e-05, "loss": 0.0085, "num_tokens": 56136445.0, "reward": 0.8489990234375, "reward_std": 0.3136948347091675, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.03125, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.9794921875, "rewards/tag_count_reward/std": 0.07221520692110062, "step": 399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1537.0, "completions/max_terminated_length": 1537.0, "completions/mean_length": 561.8671875, "completions/mean_terminated_length": 561.8671875, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.15650217635838998, "frac_reward_zero_std": 0.0, "grad_norm": 0.2951577372312112, "kl": 0.548828125, "learning_rate": 1.9809846225642886e-05, "loss": 0.0448, "num_tokens": 56309515.0, "reward": 0.7784423828125, "reward_std": 0.263945609331131, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 0.0625, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.07162948697805405, "step": 400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1529.0, "completions/max_terminated_length": 1529.0, "completions/mean_length": 701.28125, "completions/mean_terminated_length": 701.28125, "completions/min_length": 236.0, "completions/min_terminated_length": 236.0, "epoch": 0.15689343179928597, "frac_reward_zero_std": 0.0, "grad_norm": 0.2778775453763899, "kl": 0.505859375, "learning_rate": 1.9807186042732908e-05, "loss": 0.016, "num_tokens": 56520019.0, "reward": 0.7667236328125, "reward_std": 0.31425023078918457, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.171875, "rewards/format_reward/std": 0.3780108094215393, "rewards/tag_count_reward/mean": 0.9619140625, "rewards/tag_count_reward/std": 0.10739631950855255, "step": 401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1863.0, "completions/max_terminated_length": 1863.0, "completions/mean_length": 653.6640625, "completions/mean_terminated_length": 653.6640625, "completions/min_length": 190.0, "completions/min_terminated_length": 190.0, "epoch": 0.15728468724018194, "frac_reward_zero_std": 0.0, "grad_norm": 0.25250075154979107, "kl": 0.4873046875, "learning_rate": 1.9804507562463483e-05, "loss": 0.0708, "num_tokens": 56715197.0, "reward": 0.9368896484375, "reward_std": 0.23185622692108154, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.1796875, "rewards/format_reward/std": 0.38467901945114136, "rewards/tag_count_reward/mean": 0.9716796875, "rewards/tag_count_reward/std": 0.0824187695980072, "step": 402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1509.0, "completions/max_terminated_length": 1509.0, "completions/mean_length": 594.6640625, "completions/mean_terminated_length": 594.6640625, "completions/min_length": 185.0, "completions/min_terminated_length": 185.0, "epoch": 0.1576759426810779, "frac_reward_zero_std": 0.0, "grad_norm": 0.2977774647814966, "kl": 0.53125, "learning_rate": 1.9801810789831875e-05, "loss": 0.0739, "num_tokens": 56896535.0, "reward": 0.9415283203125, "reward_std": 0.2561619281768799, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.36328125, "rewards/format_reward/std": 0.48188701272010803, "rewards/tag_count_reward/mean": 0.9501953125, "rewards/tag_count_reward/std": 0.10941001027822495, "step": 403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1922.0, "completions/mean_length": 570.171875, "completions/mean_terminated_length": 564.3765258789062, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.1580671981219739, "frac_reward_zero_std": 0.0, "grad_norm": 0.3229183731196954, "kl": 0.58203125, "learning_rate": 1.979909572986948e-05, "loss": 0.103, "num_tokens": 57071475.0, "reward": 1.0457763671875, "reward_std": 0.17822763323783875, "rewards/accuracy_reward/mean": 0.86328125, "rewards/accuracy_reward/std": 0.34422317147254944, "rewards/format_reward/mean": 0.51171875, "rewards/format_reward/std": 0.5008418560028076, "rewards/tag_count_reward/mean": 0.9482421875, "rewards/tag_count_reward/std": 0.10387981683015823, "step": 404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1546.0, "completions/max_terminated_length": 1546.0, "completions/mean_length": 699.76171875, "completions/mean_terminated_length": 699.76171875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.15845845356286986, "frac_reward_zero_std": 0.0, "grad_norm": 0.2599753702069911, "kl": 0.514404296875, "learning_rate": 1.9796362387641808e-05, "loss": 0.037, "num_tokens": 57279622.0, "reward": 0.861572265625, "reward_std": 0.3503393530845642, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.69921875, "rewards/format_reward/std": 0.45949608087539673, "rewards/tag_count_reward/mean": 0.912109375, "rewards/tag_count_reward/std": 0.14207787811756134, "step": 405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 632.796875, "completions/mean_terminated_length": 632.796875, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.15884970900376583, "frac_reward_zero_std": 0.0, "grad_norm": 0.2941296728924698, "kl": 0.53857421875, "learning_rate": 1.9793610768248482e-05, "loss": 0.0628, "num_tokens": 57470210.0, "reward": 0.9359130859375, "reward_std": 0.3296355605125427, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.80859375, "rewards/format_reward/std": 0.39417871832847595, "rewards/tag_count_reward/mean": 0.9287109375, "rewards/tag_count_reward/std": 0.11941811442375183, "step": 406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1823.0, "completions/max_terminated_length": 1823.0, "completions/mean_length": 644.01953125, "completions/mean_terminated_length": 644.01953125, "completions/min_length": 183.0, "completions/min_terminated_length": 183.0, "epoch": 0.15924096444466182, "frac_reward_zero_std": 0.0, "grad_norm": 0.2780145466833637, "kl": 0.548828125, "learning_rate": 1.979084087682323e-05, "loss": 0.0321, "num_tokens": 57663559.0, "reward": 0.8109130859375, "reward_std": 0.24667400121688843, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.8984375, "rewards/format_reward/std": 0.3026638329029083, "rewards/tag_count_reward/mean": 0.9013671875, "rewards/tag_count_reward/std": 0.1357191652059555, "step": 407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1621.0, "completions/max_terminated_length": 1621.0, "completions/mean_length": 513.6953125, "completions/mean_terminated_length": 513.6953125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.15963221988555779, "frac_reward_zero_std": 0.0, "grad_norm": 0.34207326413469535, "kl": 0.62109375, "learning_rate": 1.9788052718533858e-05, "loss": 0.0753, "num_tokens": 57824345.0, "reward": 1.0294189453125, "reward_std": 0.2353833019733429, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.8955078125, "rewards/tag_count_reward/std": 0.133097842335701, "step": 408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1252.0, "completions/max_terminated_length": 1252.0, "completions/mean_length": 545.109375, "completions/mean_terminated_length": 545.109375, "completions/min_length": 92.0, "completions/min_terminated_length": 92.0, "epoch": 0.16002347532645375, "frac_reward_zero_std": 0.0, "grad_norm": 0.3190860747083147, "kl": 0.62451171875, "learning_rate": 1.9785246298582262e-05, "loss": 0.0111, "num_tokens": 57994133.0, "reward": 1.0103759765625, "reward_std": 0.3693007826805115, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.9375, "rewards/format_reward/std": 0.24253563582897186, "rewards/tag_count_reward/mean": 0.8955078125, "rewards/tag_count_reward/std": 0.13851217925548553, "step": 409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1522.0, "completions/max_terminated_length": 1522.0, "completions/mean_length": 523.96875, "completions/mean_terminated_length": 523.96875, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.16041473076734974, "frac_reward_zero_std": 0.0, "grad_norm": 0.30179299216032945, "kl": 0.5322265625, "learning_rate": 1.9782421622204402e-05, "loss": 0.0144, "num_tokens": 58158397.0, "reward": 1.045166015625, "reward_std": 0.30682340264320374, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.9609375, "rewards/format_reward/std": 0.19412322342395782, "rewards/tag_count_reward/mean": 0.962890625, "rewards/tag_count_reward/std": 0.08905739337205887, "step": 410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 884.0, "completions/max_terminated_length": 884.0, "completions/mean_length": 469.01171875, "completions/mean_terminated_length": 469.01171875, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.1608059862082457, "frac_reward_zero_std": 0.125, "grad_norm": 0.29880581508234083, "kl": 0.611328125, "learning_rate": 1.977957869467031e-05, "loss": 0.0064, "num_tokens": 58308304.0, "reward": 0.9818115234375, "reward_std": 0.28672999143600464, "rewards/accuracy_reward/mean": 0.73828125, "rewards/accuracy_reward/std": 0.4404313564300537, "rewards/format_reward/mean": 0.95703125, "rewards/format_reward/std": 0.20318391919136047, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1297.0, "completions/max_terminated_length": 1297.0, "completions/mean_length": 551.91015625, "completions/mean_terminated_length": 551.91015625, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.16119724164914168, "frac_reward_zero_std": 0.125, "grad_norm": 0.29815891932795785, "kl": 0.53564453125, "learning_rate": 1.977671752128406e-05, "loss": 0.0134, "num_tokens": 58479801.0, "reward": 1.0333251953125, "reward_std": 0.2802361845970154, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.0554114393889904, "step": 412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1240.0, "completions/max_terminated_length": 1240.0, "completions/mean_length": 548.73046875, "completions/mean_terminated_length": 548.73046875, "completions/min_length": 171.0, "completions/min_terminated_length": 171.0, "epoch": 0.16158849709003767, "frac_reward_zero_std": 0.125, "grad_norm": 0.354372431173064, "kl": 0.56787109375, "learning_rate": 1.977383810738377e-05, "loss": 0.0212, "num_tokens": 58649300.0, "reward": 0.9654541015625, "reward_std": 0.292558491230011, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 458.953125, "completions/mean_terminated_length": 458.953125, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.16197975253093364, "frac_reward_zero_std": 0.3125, "grad_norm": 0.30850747531826256, "kl": 0.572265625, "learning_rate": 1.9770940458341583e-05, "loss": 0.0247, "num_tokens": 58794760.0, "reward": 1.0458984375, "reward_std": 0.2201652228832245, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 457.55078125, "completions/mean_terminated_length": 457.55078125, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.1623710079718296, "frac_reward_zero_std": 0.375, "grad_norm": 0.2807732409034864, "kl": 0.564453125, "learning_rate": 1.976802457956368e-05, "loss": -0.0027, "num_tokens": 58942693.0, "reward": 1.0101318359375, "reward_std": 0.23781812191009521, "rewards/accuracy_reward/mean": 0.76171875, "rewards/accuracy_reward/std": 0.4268665909767151, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1709.0, "completions/max_terminated_length": 1709.0, "completions/mean_length": 567.55078125, "completions/mean_terminated_length": 567.55078125, "completions/min_length": 134.0, "completions/min_terminated_length": 134.0, "epoch": 0.1627622634127256, "frac_reward_zero_std": 0.25, "grad_norm": 0.2669562343604311, "kl": 0.4873046875, "learning_rate": 1.976509047649024e-05, "loss": 0.0068, "num_tokens": 59118386.0, "reward": 1.0224609375, "reward_std": 0.28612375259399414, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1221.0, "completions/max_terminated_length": 1221.0, "completions/mean_length": 610.27734375, "completions/mean_terminated_length": 610.27734375, "completions/min_length": 196.0, "completions/min_terminated_length": 196.0, "epoch": 0.16315351885362156, "frac_reward_zero_std": 0.375, "grad_norm": 0.23240112577330813, "kl": 0.509033203125, "learning_rate": 1.9762138154595448e-05, "loss": 0.0145, "num_tokens": 59305449.0, "reward": 0.8466796875, "reward_std": 0.23152203857898712, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 436.859375, "completions/mean_terminated_length": 436.859375, "completions/min_length": 139.0, "completions/min_terminated_length": 139.0, "epoch": 0.16354477429451753, "frac_reward_zero_std": 0.3125, "grad_norm": 0.31673465023677166, "kl": 0.65966796875, "learning_rate": 1.9759167619387474e-05, "loss": 0.0273, "num_tokens": 59444789.0, "reward": 1.13818359375, "reward_std": 0.1779569685459137, "rewards/accuracy_reward/mean": 0.890625, "rewards/accuracy_reward/std": 0.31272050738334656, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1308.0, "completions/max_terminated_length": 1308.0, "completions/mean_length": 563.32421875, "completions/mean_terminated_length": 563.32421875, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.16393602973541352, "frac_reward_zero_std": 0.25, "grad_norm": 0.2807885382214668, "kl": 0.56005859375, "learning_rate": 1.975617887640848e-05, "loss": 0.0237, "num_tokens": 59619032.0, "reward": 0.8502197265625, "reward_std": 0.3128945827484131, "rewards/accuracy_reward/mean": 0.6015625, "rewards/accuracy_reward/std": 0.4905354380607605, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1316.0, "completions/max_terminated_length": 1316.0, "completions/mean_length": 518.85546875, "completions/mean_terminated_length": 518.85546875, "completions/min_length": 210.0, "completions/min_terminated_length": 210.0, "epoch": 0.16432728517630948, "frac_reward_zero_std": 0.125, "grad_norm": 0.30274598276704, "kl": 0.60693359375, "learning_rate": 1.9753171931234588e-05, "loss": 0.0021, "num_tokens": 59781763.0, "reward": 0.8297119140625, "reward_std": 0.35487979650497437, "rewards/accuracy_reward/mean": 0.58203125, "rewards/accuracy_reward/std": 0.49419113993644714, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 467.98046875, "completions/mean_terminated_length": 467.98046875, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.16471854061720545, "frac_reward_zero_std": 0.0625, "grad_norm": 0.34353442569744524, "kl": 0.6025390625, "learning_rate": 1.9750146789475885e-05, "loss": 0.0129, "num_tokens": 59931230.0, "reward": 0.967041015625, "reward_std": 0.3892444670200348, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1011.0, "completions/max_terminated_length": 1011.0, "completions/mean_length": 456.3203125, "completions/mean_terminated_length": 456.3203125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.16510979605810144, "frac_reward_zero_std": 0.25, "grad_norm": 0.33756824410272923, "kl": 0.62451171875, "learning_rate": 1.9747103456776406e-05, "loss": 0.043, "num_tokens": 60077872.0, "reward": 0.999755859375, "reward_std": 0.25827324390411377, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04915804788470268, "step": 422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1074.0, "completions/max_terminated_length": 1074.0, "completions/mean_length": 497.94140625, "completions/mean_terminated_length": 497.94140625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.1655010514989974, "frac_reward_zero_std": 0.125, "grad_norm": 0.32533828583343277, "kl": 0.6142578125, "learning_rate": 1.9744041938814125e-05, "loss": 0.0251, "num_tokens": 60234753.0, "reward": 0.815185546875, "reward_std": 0.25465595722198486, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 763.0, "completions/max_terminated_length": 763.0, "completions/mean_length": 398.7109375, "completions/mean_terminated_length": 398.7109375, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.16589230693989337, "frac_reward_zero_std": 0.125, "grad_norm": 0.3908841966266483, "kl": 0.6279296875, "learning_rate": 1.974096224130095e-05, "loss": 0.0423, "num_tokens": 60365527.0, "reward": 1.03564453125, "reward_std": 0.28490424156188965, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 950.0, "completions/max_terminated_length": 950.0, "completions/mean_length": 411.2421875, "completions/mean_terminated_length": 411.2421875, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.16628356238078937, "frac_reward_zero_std": 0.1875, "grad_norm": 0.33110550200739025, "kl": 0.6298828125, "learning_rate": 1.9737864369982695e-05, "loss": 0.011, "num_tokens": 60499317.0, "reward": 0.889404296875, "reward_std": 0.34133338928222656, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 943.0, "completions/max_terminated_length": 943.0, "completions/mean_length": 403.234375, "completions/mean_terminated_length": 403.234375, "completions/min_length": 142.0, "completions/min_terminated_length": 142.0, "epoch": 0.16667481782168533, "frac_reward_zero_std": 0.25, "grad_norm": 0.33817523966685803, "kl": 0.61572265625, "learning_rate": 1.9734748330639088e-05, "loss": 0.0211, "num_tokens": 60632417.0, "reward": 0.9508056640625, "reward_std": 0.2856108248233795, "rewards/accuracy_reward/mean": 0.703125, "rewards/accuracy_reward/std": 0.45777595043182373, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 883.0, "completions/max_terminated_length": 883.0, "completions/mean_length": 401.0, "completions/mean_terminated_length": 401.0, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.1670660732625813, "frac_reward_zero_std": 0.1875, "grad_norm": 0.32034594005032835, "kl": 0.64892578125, "learning_rate": 1.9731614129083756e-05, "loss": 0.0102, "num_tokens": 60764801.0, "reward": 0.899169921875, "reward_std": 0.2933042347431183, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04388983175158501, "step": 427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1036.0, "completions/max_terminated_length": 1036.0, "completions/mean_length": 471.50390625, "completions/mean_terminated_length": 471.50390625, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.1674573287034773, "frac_reward_zero_std": 0.0625, "grad_norm": 0.35077554684361223, "kl": 0.609375, "learning_rate": 1.972846177116421e-05, "loss": 0.0288, "num_tokens": 60915714.0, "reward": 0.74560546875, "reward_std": 0.3475818932056427, "rewards/accuracy_reward/mean": 0.49609375, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1204.0, "completions/max_terminated_length": 1204.0, "completions/mean_length": 394.61328125, "completions/mean_terminated_length": 394.61328125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.16784858414437326, "frac_reward_zero_std": 0.5, "grad_norm": 0.301760219552403, "kl": 0.63671875, "learning_rate": 1.972529126276183e-05, "loss": 0.0042, "num_tokens": 61046607.0, "reward": 0.8343505859375, "reward_std": 0.1899050772190094, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 698.0, "completions/max_terminated_length": 698.0, "completions/mean_length": 370.7734375, "completions/mean_terminated_length": 370.7734375, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.16823983958526922, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3643098026270763, "kl": 0.65283203125, "learning_rate": 1.972210260979186e-05, "loss": 0.0409, "num_tokens": 61170261.0, "reward": 0.91357421875, "reward_std": 0.3017731010913849, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1332.0, "completions/max_terminated_length": 1332.0, "completions/mean_length": 394.72265625, "completions/mean_terminated_length": 394.72265625, "completions/min_length": 95.0, "completions/min_terminated_length": 95.0, "epoch": 0.16863109502616522, "frac_reward_zero_std": 0.5, "grad_norm": 0.29075108836587477, "kl": 0.66650390625, "learning_rate": 1.9718895818203413e-05, "loss": 0.0102, "num_tokens": 61300046.0, "reward": 1.05224609375, "reward_std": 0.2072772979736328, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1015.0, "completions/max_terminated_length": 1015.0, "completions/mean_length": 368.1484375, "completions/mean_terminated_length": 368.1484375, "completions/min_length": 102.0, "completions/min_terminated_length": 102.0, "epoch": 0.16902235046706118, "frac_reward_zero_std": 0.4375, "grad_norm": 0.30449630476343176, "kl": 0.68994140625, "learning_rate": 1.9715670893979416e-05, "loss": 0.0222, "num_tokens": 61423940.0, "reward": 1.1312255859375, "reward_std": 0.13101503252983093, "rewards/accuracy_reward/mean": 0.8828125, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1052.0, "completions/max_terminated_length": 1052.0, "completions/mean_length": 486.671875, "completions/mean_terminated_length": 486.671875, "completions/min_length": 177.0, "completions/min_terminated_length": 177.0, "epoch": 0.16941360590795715, "frac_reward_zero_std": 0.375, "grad_norm": 0.26300370062098294, "kl": 0.5888671875, "learning_rate": 1.971242784313665e-05, "loss": 0.0255, "num_tokens": 61580416.0, "reward": 0.888916015625, "reward_std": 0.21158866584300995, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 874.0, "completions/max_terminated_length": 874.0, "completions/mean_length": 471.5234375, "completions/mean_terminated_length": 471.5234375, "completions/min_length": 117.0, "completions/min_terminated_length": 117.0, "epoch": 0.16980486134885314, "frac_reward_zero_std": 0.375, "grad_norm": 0.25983691555186067, "kl": 0.54443359375, "learning_rate": 1.97091666717257e-05, "loss": 0.0214, "num_tokens": 61732134.0, "reward": 0.984375, "reward_std": 0.2846308946609497, "rewards/accuracy_reward/mean": 0.734375, "rewards/accuracy_reward/std": 0.4425306022167206, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1055.0, "completions/max_terminated_length": 1055.0, "completions/mean_length": 415.12109375, "completions/mean_terminated_length": 415.12109375, "completions/min_length": 128.0, "completions/min_terminated_length": 128.0, "epoch": 0.1701961167897491, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2732419440712056, "kl": 0.62890625, "learning_rate": 1.970588738583097e-05, "loss": 0.0065, "num_tokens": 61866389.0, "reward": 1.04296875, "reward_std": 0.19454212486743927, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 941.0, "completions/max_terminated_length": 941.0, "completions/mean_length": 467.40625, "completions/mean_terminated_length": 467.40625, "completions/min_length": 125.0, "completions/min_terminated_length": 125.0, "epoch": 0.17058737223064507, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3203893191470893, "kl": 0.59375, "learning_rate": 1.9702589991570646e-05, "loss": 0.026, "num_tokens": 62015133.0, "reward": 1.0423583984375, "reward_std": 0.2543026804924011, "rewards/accuracy_reward/mean": 0.79296875, "rewards/accuracy_reward/std": 0.40597182512283325, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1349.0, "completions/max_terminated_length": 1349.0, "completions/mean_length": 499.125, "completions/mean_terminated_length": 499.125, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.17097862767154107, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3306473394622864, "kl": 0.58251953125, "learning_rate": 1.9699274495096712e-05, "loss": 0.0262, "num_tokens": 62174157.0, "reward": 0.9700927734375, "reward_std": 0.30624955892562866, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.041130900382995605, "step": 437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1033.0, "completions/max_terminated_length": 1033.0, "completions/mean_length": 502.15234375, "completions/mean_terminated_length": 502.15234375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.17136988311243703, "frac_reward_zero_std": 0.25, "grad_norm": 0.32163375505094516, "kl": 0.580078125, "learning_rate": 1.9695940902594926e-05, "loss": 0.0135, "num_tokens": 62332276.0, "reward": 1.006591796875, "reward_std": 0.2994994521141052, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1018.0, "completions/max_terminated_length": 1018.0, "completions/mean_length": 529.91015625, "completions/mean_terminated_length": 529.91015625, "completions/min_length": 158.0, "completions/min_terminated_length": 158.0, "epoch": 0.171761138553333, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2379299835833788, "kl": 0.529052734375, "learning_rate": 1.9692589220284797e-05, "loss": 0.0131, "num_tokens": 62497677.0, "reward": 0.97412109375, "reward_std": 0.17456504702568054, "rewards/accuracy_reward/mean": 0.7265625, "rewards/accuracy_reward/std": 0.446596622467041, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1118.0, "completions/max_terminated_length": 1118.0, "completions/mean_length": 496.578125, "completions/mean_terminated_length": 496.578125, "completions/min_length": 226.0, "completions/min_terminated_length": 226.0, "epoch": 0.172152393994229, "frac_reward_zero_std": 0.25, "grad_norm": 0.28957030542706075, "kl": 0.57470703125, "learning_rate": 1.968921945441959e-05, "loss": 0.0082, "num_tokens": 62655313.0, "reward": 1.0267333984375, "reward_std": 0.27865704894065857, "rewards/accuracy_reward/mean": 0.77734375, "rewards/accuracy_reward/std": 0.41684433817863464, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1269.0, "completions/max_terminated_length": 1269.0, "completions/mean_length": 535.8828125, "completions/mean_terminated_length": 535.8828125, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.17254364943512496, "frac_reward_zero_std": 0.25, "grad_norm": 0.3732463454769088, "kl": 0.564208984375, "learning_rate": 1.9685831611286312e-05, "loss": 0.0175, "num_tokens": 62820947.0, "reward": 0.99853515625, "reward_std": 0.23339399695396423, "rewards/accuracy_reward/mean": 0.75, "rewards/accuracy_reward/std": 0.4338609278202057, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03814799711108208, "step": 441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1141.0, "completions/max_terminated_length": 1141.0, "completions/mean_length": 457.49609375, "completions/mean_terminated_length": 457.49609375, "completions/min_length": 193.0, "completions/min_terminated_length": 193.0, "epoch": 0.17293490487602092, "frac_reward_zero_std": 0.5, "grad_norm": 0.291670847906057, "kl": 0.60009765625, "learning_rate": 1.9682425697205695e-05, "loss": 0.0212, "num_tokens": 62966034.0, "reward": 1.08349609375, "reward_std": 0.20838339626789093, "rewards/accuracy_reward/mean": 0.8359375, "rewards/accuracy_reward/std": 0.3710577189922333, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 913.0, "completions/max_terminated_length": 913.0, "completions/mean_length": 515.40234375, "completions/mean_terminated_length": 515.40234375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.17332616031691692, "frac_reward_zero_std": 0.3125, "grad_norm": 0.284470493294945, "kl": 0.5869140625, "learning_rate": 1.967900171853218e-05, "loss": 0.0056, "num_tokens": 63127977.0, "reward": 1.0299072265625, "reward_std": 0.21935853362083435, "rewards/accuracy_reward/mean": 0.78125, "rewards/accuracy_reward/std": 0.41420844197273254, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1264.0, "completions/mean_length": 609.95703125, "completions/mean_terminated_length": 604.3176879882812, "completions/min_length": 247.0, "completions/min_terminated_length": 247.0, "epoch": 0.17371741575781288, "frac_reward_zero_std": 0.25, "grad_norm": 0.2630305543176937, "kl": 0.5244140625, "learning_rate": 1.9675559681653918e-05, "loss": 0.0236, "num_tokens": 63313182.0, "reward": 0.90185546875, "reward_std": 0.28577500581741333, "rewards/accuracy_reward/mean": 0.65234375, "rewards/accuracy_reward/std": 0.4771590530872345, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 516.63671875, "completions/mean_terminated_length": 516.63671875, "completions/min_length": 223.0, "completions/min_terminated_length": 223.0, "epoch": 0.17410867119870885, "frac_reward_zero_std": 0.4375, "grad_norm": 0.26235820155051987, "kl": 0.55419921875, "learning_rate": 1.9672099592992752e-05, "loss": 0.019, "num_tokens": 63474833.0, "reward": 0.944091796875, "reward_std": 0.23735637962818146, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 986.0, "completions/max_terminated_length": 986.0, "completions/mean_length": 582.46484375, "completions/mean_terminated_length": 582.46484375, "completions/min_length": 244.0, "completions/min_terminated_length": 244.0, "epoch": 0.17449992663960484, "frac_reward_zero_std": 0.375, "grad_norm": 0.24333801963020646, "kl": 0.56494140625, "learning_rate": 1.9668621459004197e-05, "loss": 0.0206, "num_tokens": 63651896.0, "reward": 1.01513671875, "reward_std": 0.2305922508239746, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1213.0, "completions/max_terminated_length": 1213.0, "completions/mean_length": 592.05859375, "completions/mean_terminated_length": 592.05859375, "completions/min_length": 212.0, "completions/min_terminated_length": 212.0, "epoch": 0.1748911820805008, "frac_reward_zero_std": 0.375, "grad_norm": 0.24713029816138884, "kl": 0.544921875, "learning_rate": 1.9665125286177448e-05, "loss": 0.017, "num_tokens": 63832503.0, "reward": 0.97216796875, "reward_std": 0.2684975266456604, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1295.0, "completions/max_terminated_length": 1295.0, "completions/mean_length": 642.30859375, "completions/mean_terminated_length": 642.30859375, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.17528243752139677, "frac_reward_zero_std": 0.25, "grad_norm": 0.26300848794446147, "kl": 0.544921875, "learning_rate": 1.9661611081035342e-05, "loss": 0.0388, "num_tokens": 64026678.0, "reward": 0.9681396484375, "reward_std": 0.31218603253364563, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1105.0, "completions/max_terminated_length": 1105.0, "completions/mean_length": 580.0390625, "completions/mean_terminated_length": 580.0390625, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.17567369296229277, "frac_reward_zero_std": 0.3125, "grad_norm": 0.24803388470581586, "kl": 0.524658203125, "learning_rate": 1.965807885013437e-05, "loss": 0.0212, "num_tokens": 64204864.0, "reward": 0.8934326171875, "reward_std": 0.27901211380958557, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1161.0, "completions/mean_length": 575.51171875, "completions/mean_terminated_length": 569.7373046875, "completions/min_length": 169.0, "completions/min_terminated_length": 169.0, "epoch": 0.17606494840318873, "frac_reward_zero_std": 0.125, "grad_norm": 0.2894196641304778, "kl": 0.56640625, "learning_rate": 1.9654528600064638e-05, "loss": 0.0432, "num_tokens": 64380419.0, "reward": 0.8880615234375, "reward_std": 0.3204813599586487, "rewards/accuracy_reward/mean": 0.640625, "rewards/accuracy_reward/std": 0.4807571768760681, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04671131819486618, "step": 450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1046.0, "completions/max_terminated_length": 1046.0, "completions/mean_length": 569.5, "completions/mean_terminated_length": 569.5, "completions/min_length": 255.0, "completions/min_terminated_length": 255.0, "epoch": 0.1764562038440847, "frac_reward_zero_std": 0.1875, "grad_norm": 0.2721469096090824, "kl": 0.52685546875, "learning_rate": 1.9650960337449892e-05, "loss": 0.0104, "num_tokens": 64558099.0, "reward": 0.7752685546875, "reward_std": 0.27455002069473267, "rewards/accuracy_reward/mean": 0.52734375, "rewards/accuracy_reward/std": 0.5002297759056091, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1029.0, "completions/max_terminated_length": 1029.0, "completions/mean_length": 553.64453125, "completions/mean_terminated_length": 553.64453125, "completions/min_length": 258.0, "completions/min_terminated_length": 258.0, "epoch": 0.1768474592849807, "frac_reward_zero_std": 0.375, "grad_norm": 0.2589489657944005, "kl": 0.57568359375, "learning_rate": 1.964737406894747e-05, "loss": 0.0282, "num_tokens": 64728392.0, "reward": 0.9124755859375, "reward_std": 0.21796870231628418, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1166.0, "completions/max_terminated_length": 1166.0, "completions/mean_length": 564.46875, "completions/mean_terminated_length": 564.46875, "completions/min_length": 157.0, "completions/min_terminated_length": 157.0, "epoch": 0.17723871472587666, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2607003156941981, "kl": 0.5732421875, "learning_rate": 1.96437698012483e-05, "loss": 0.0019, "num_tokens": 64901792.0, "reward": 0.8984375, "reward_std": 0.23283424973487854, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1229.0, "completions/max_terminated_length": 1229.0, "completions/mean_length": 542.0546875, "completions/mean_terminated_length": 542.0546875, "completions/min_length": 176.0, "completions/min_terminated_length": 176.0, "epoch": 0.17762997016677262, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2559841888297123, "kl": 0.556640625, "learning_rate": 1.964014754107691e-05, "loss": 0.0103, "num_tokens": 65070830.0, "reward": 1.057373046875, "reward_std": 0.2272672951221466, "rewards/accuracy_reward/mean": 0.80859375, "rewards/accuracy_reward/std": 0.39417871832847595, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 925.0, "completions/max_terminated_length": 925.0, "completions/mean_length": 492.015625, "completions/mean_terminated_length": 492.015625, "completions/min_length": 172.0, "completions/min_terminated_length": 172.0, "epoch": 0.17802122560766862, "frac_reward_zero_std": 0.25, "grad_norm": 0.29319246439955604, "kl": 0.57470703125, "learning_rate": 1.9636507295191375e-05, "loss": 0.0172, "num_tokens": 65227410.0, "reward": 0.955810546875, "reward_std": 0.286193311214447, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1140.0, "completions/max_terminated_length": 1140.0, "completions/mean_length": 506.58984375, "completions/mean_terminated_length": 506.58984375, "completions/min_length": 208.0, "completions/min_terminated_length": 208.0, "epoch": 0.17841248104856458, "frac_reward_zero_std": 0.25, "grad_norm": 0.30775621904276634, "kl": 0.541015625, "learning_rate": 1.9632849070383342e-05, "loss": 0.0213, "num_tokens": 65387209.0, "reward": 0.9056396484375, "reward_std": 0.31879138946533203, "rewards/accuracy_reward/mean": 0.65625, "rewards/accuracy_reward/std": 0.47588926553726196, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 902.0, "completions/max_terminated_length": 902.0, "completions/mean_length": 445.875, "completions/mean_terminated_length": 445.875, "completions/min_length": 215.0, "completions/min_terminated_length": 215.0, "epoch": 0.17880373648946055, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3248460322115912, "kl": 0.6064453125, "learning_rate": 1.9629172873477995e-05, "loss": 0.0112, "num_tokens": 65530793.0, "reward": 1.0140380859375, "reward_std": 0.2419605553150177, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 489.16796875, "completions/mean_terminated_length": 489.16796875, "completions/min_length": 228.0, "completions/min_terminated_length": 228.0, "epoch": 0.17919499193035654, "frac_reward_zero_std": 0.125, "grad_norm": 0.2915283659113366, "kl": 0.59765625, "learning_rate": 1.9625478711334044e-05, "loss": 0.0288, "num_tokens": 65685492.0, "reward": 1.0059814453125, "reward_std": 0.28412413597106934, "rewards/accuracy_reward/mean": 0.7578125, "rewards/accuracy_reward/std": 0.4292463958263397, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1080.0, "completions/max_terminated_length": 1080.0, "completions/mean_length": 548.640625, "completions/mean_terminated_length": 548.640625, "completions/min_length": 164.0, "completions/min_terminated_length": 164.0, "epoch": 0.1795862473712525, "frac_reward_zero_std": 0.4375, "grad_norm": 0.23641627111597707, "kl": 0.5498046875, "learning_rate": 1.962176659084373e-05, "loss": 0.036, "num_tokens": 65856488.0, "reward": 1.0625, "reward_std": 0.23182064294815063, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 929.0, "completions/max_terminated_length": 929.0, "completions/mean_length": 483.484375, "completions/mean_terminated_length": 483.484375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.17997750281214847, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3306133549898433, "kl": 0.541015625, "learning_rate": 1.9618036518932783e-05, "loss": 0.0102, "num_tokens": 66010052.0, "reward": 1.0606689453125, "reward_std": 0.23660477995872498, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1099.0, "completions/max_terminated_length": 1099.0, "completions/mean_length": 509.64453125, "completions/mean_terminated_length": 509.64453125, "completions/min_length": 143.0, "completions/min_terminated_length": 143.0, "epoch": 0.18036875825304446, "frac_reward_zero_std": 0.125, "grad_norm": 0.31662737573903865, "kl": 0.56884765625, "learning_rate": 1.961428850256044e-05, "loss": 0.0098, "num_tokens": 66170841.0, "reward": 0.8861083984375, "reward_std": 0.3439856767654419, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1009.0, "completions/mean_length": 433.0703125, "completions/mean_terminated_length": 407.4365234375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.18076001369394043, "frac_reward_zero_std": 0.375, "grad_norm": 2.0935536622189144, "kl": 0.64453125, "learning_rate": 1.961052254871941e-05, "loss": 0.1655, "num_tokens": 66309947.0, "reward": 0.9931640625, "reward_std": 0.2200741022825241, "rewards/accuracy_reward/mean": 0.74609375, "rewards/accuracy_reward/std": 0.4360972046852112, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06213126704096794, "step": 462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1040.0, "completions/max_terminated_length": 1040.0, "completions/mean_length": 495.2109375, "completions/mean_terminated_length": 495.2109375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.1811512691348364, "frac_reward_zero_std": 0.1875, "grad_norm": 0.2743496250363356, "kl": 0.54541015625, "learning_rate": 1.9606738664435863e-05, "loss": 0.0305, "num_tokens": 66469105.0, "reward": 1.0228271484375, "reward_std": 0.2774507999420166, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 961.0, "completions/max_terminated_length": 961.0, "completions/mean_length": 432.33203125, "completions/mean_terminated_length": 432.33203125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.1815425245757324, "frac_reward_zero_std": 0.25, "grad_norm": 0.2715757516671629, "kl": 0.576171875, "learning_rate": 1.9602936856769432e-05, "loss": 0.0225, "num_tokens": 66610694.0, "reward": 0.971435546875, "reward_std": 0.26197534799575806, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1327.0, "completions/max_terminated_length": 1327.0, "completions/mean_length": 474.41015625, "completions/mean_terminated_length": 474.41015625, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.18193378001662835, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3030026743909708, "kl": 0.5419921875, "learning_rate": 1.9599117132813187e-05, "loss": 0.0306, "num_tokens": 66763007.0, "reward": 0.97265625, "reward_std": 0.23030208051204681, "rewards/accuracy_reward/mean": 0.72265625, "rewards/accuracy_reward/std": 0.4485645890235901, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 360.90234375, "completions/mean_terminated_length": 360.90234375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.18232503545752432, "frac_reward_zero_std": 0.3125, "grad_norm": 0.30674474009930136, "kl": 0.5986328125, "learning_rate": 1.9595279499693617e-05, "loss": 0.0226, "num_tokens": 66884230.0, "reward": 0.9921875, "reward_std": 0.29470524191856384, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1362.0, "completions/max_terminated_length": 1362.0, "completions/mean_length": 344.96484375, "completions/mean_terminated_length": 344.96484375, "completions/min_length": 155.0, "completions/min_terminated_length": 155.0, "epoch": 0.1827162908984203, "frac_reward_zero_std": 0.375, "grad_norm": 0.6719518289261955, "kl": 0.537109375, "learning_rate": 1.9591423964570634e-05, "loss": 0.0347, "num_tokens": 67003693.0, "reward": 0.95703125, "reward_std": 0.2799493670463562, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 730.0, "completions/max_terminated_length": 730.0, "completions/mean_length": 355.0546875, "completions/mean_terminated_length": 355.0546875, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.18310754633931628, "frac_reward_zero_std": 0.125, "grad_norm": 0.3455175920405437, "kl": 0.544921875, "learning_rate": 1.958755053463755e-05, "loss": 0.0287, "num_tokens": 67124795.0, "reward": 0.9556884765625, "reward_std": 0.3404730260372162, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 834.0, "completions/max_terminated_length": 834.0, "completions/mean_length": 324.44921875, "completions/mean_terminated_length": 324.44921875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.18349880178021225, "frac_reward_zero_std": 0.375, "grad_norm": 0.3109560434560615, "kl": 0.5302734375, "learning_rate": 1.9583659217121048e-05, "loss": 0.0167, "num_tokens": 67236350.0, "reward": 0.9921875, "reward_std": 0.2806527018547058, "rewards/accuracy_reward/mean": 0.7421875, "rewards/accuracy_reward/std": 0.4382871091365814, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 655.0, "completions/max_terminated_length": 655.0, "completions/mean_length": 317.62890625, "completions/mean_terminated_length": 317.62890625, "completions/min_length": 101.0, "completions/min_terminated_length": 101.0, "epoch": 0.18389005722110824, "frac_reward_zero_std": 0.25, "grad_norm": 0.3310399885603887, "kl": 0.5419921875, "learning_rate": 1.957975001928121e-05, "loss": 0.0192, "num_tokens": 67346863.0, "reward": 1.0494384765625, "reward_std": 0.24255643784999847, "rewards/accuracy_reward/mean": 0.80078125, "rewards/accuracy_reward/std": 0.40019527077674866, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 821.0, "completions/max_terminated_length": 821.0, "completions/mean_length": 332.63671875, "completions/mean_terminated_length": 332.63671875, "completions/min_length": 77.0, "completions/min_terminated_length": 77.0, "epoch": 0.1842813126620042, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2828772246645858, "kl": 0.5361328125, "learning_rate": 1.9575822948411454e-05, "loss": 0.0004, "num_tokens": 67461474.0, "reward": 0.9793701171875, "reward_std": 0.21436947584152222, "rewards/accuracy_reward/mean": 0.73046875, "rewards/accuracy_reward/std": 0.44458550214767456, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 744.0, "completions/max_terminated_length": 744.0, "completions/mean_length": 334.8984375, "completions/mean_terminated_length": 334.8984375, "completions/min_length": 121.0, "completions/min_terminated_length": 121.0, "epoch": 0.18467256810290017, "frac_reward_zero_std": 0.375, "grad_norm": 0.28062416440682464, "kl": 0.467041015625, "learning_rate": 1.9571878011838557e-05, "loss": 0.0083, "num_tokens": 67576760.0, "reward": 1.0657958984375, "reward_std": 0.1947728991508484, "rewards/accuracy_reward/mean": 0.81640625, "rewards/accuracy_reward/std": 0.387910932302475, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 618.0, "completions/max_terminated_length": 618.0, "completions/mean_length": 312.7734375, "completions/mean_terminated_length": 312.7734375, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.18506382354379616, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3094904466760826, "kl": 0.5234375, "learning_rate": 1.9567915216922624e-05, "loss": 0.0188, "num_tokens": 67687070.0, "reward": 1.0625, "reward_std": 0.14954319596290588, "rewards/accuracy_reward/mean": 0.8125, "rewards/accuracy_reward/std": 0.3910769522190094, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 378.85546875, "completions/mean_terminated_length": 378.85546875, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.18545507898469213, "frac_reward_zero_std": 0.375, "grad_norm": 0.30060413287041127, "kl": 0.509033203125, "learning_rate": 1.9563934571057074e-05, "loss": 0.0209, "num_tokens": 67815385.0, "reward": 0.9251708984375, "reward_std": 0.20592600107192993, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 326.1484375, "completions/mean_terminated_length": 326.1484375, "completions/min_length": 99.0, "completions/min_terminated_length": 99.0, "epoch": 0.1858463344255881, "frac_reward_zero_std": 0.5, "grad_norm": 0.3046408215217034, "kl": 0.58984375, "learning_rate": 1.9559936081668644e-05, "loss": 0.0351, "num_tokens": 67929327.0, "reward": 1.0234375, "reward_std": 0.21103434264659882, "rewards/accuracy_reward/mean": 0.7734375, "rewards/accuracy_reward/std": 0.41942715644836426, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 607.0, "completions/max_terminated_length": 607.0, "completions/mean_length": 296.41015625, "completions/mean_terminated_length": 296.41015625, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1862375898664841, "frac_reward_zero_std": 0.5, "grad_norm": 0.29221745389617126, "kl": 0.60400390625, "learning_rate": 1.9555919756217346e-05, "loss": 0.0268, "num_tokens": 68034312.0, "reward": 1.0928955078125, "reward_std": 0.18313390016555786, "rewards/accuracy_reward/mean": 0.84375, "rewards/accuracy_reward/std": 0.3638034462928772, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.0468750037252903, "step": 476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1069.0, "completions/max_terminated_length": 1069.0, "completions/mean_length": 378.9375, "completions/mean_terminated_length": 378.9375, "completions/min_length": 140.0, "completions/min_terminated_length": 140.0, "epoch": 0.18662884530738005, "frac_reward_zero_std": 0.4375, "grad_norm": 0.24341813602632736, "kl": 0.510009765625, "learning_rate": 1.9551885602196482e-05, "loss": 0.0053, "num_tokens": 68160840.0, "reward": 0.921875, "reward_std": 0.2241290956735611, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 923.0, "completions/max_terminated_length": 923.0, "completions/mean_length": 330.5390625, "completions/mean_terminated_length": 330.5390625, "completions/min_length": 113.0, "completions/min_terminated_length": 113.0, "epoch": 0.18702010074827602, "frac_reward_zero_std": 0.25, "grad_norm": 0.48535803704397973, "kl": 0.6708984375, "learning_rate": 1.954783362713261e-05, "loss": 0.0364, "num_tokens": 68275538.0, "reward": 1.069580078125, "reward_std": 0.3111705780029297, "rewards/accuracy_reward/mean": 0.8203125, "rewards/accuracy_reward/std": 0.38467901945114136, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 828.0, "completions/max_terminated_length": 828.0, "completions/mean_length": 398.15625, "completions/mean_terminated_length": 398.15625, "completions/min_length": 154.0, "completions/min_terminated_length": 154.0, "epoch": 0.187411356189172, "frac_reward_zero_std": 0.3125, "grad_norm": 0.28447595829859773, "kl": 0.539306640625, "learning_rate": 1.954376383858553e-05, "loss": 0.0384, "num_tokens": 68407386.0, "reward": 0.893798828125, "reward_std": 0.2778567373752594, "rewards/accuracy_reward/mean": 0.64453125, "rewards/accuracy_reward/std": 0.4795927405357361, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 967.0, "completions/max_terminated_length": 967.0, "completions/mean_length": 394.48046875, "completions/mean_terminated_length": 394.48046875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.18780261163006798, "frac_reward_zero_std": 0.375, "grad_norm": 0.30627256444695355, "kl": 0.568359375, "learning_rate": 1.9539676244148295e-05, "loss": 0.0288, "num_tokens": 68538085.0, "reward": 1.037109375, "reward_std": 0.23040425777435303, "rewards/accuracy_reward/mean": 0.7890625, "rewards/accuracy_reward/std": 0.4087733030319214, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 760.0, "completions/max_terminated_length": 760.0, "completions/mean_length": 400.3828125, "completions/mean_terminated_length": 400.3828125, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.18819386707096394, "frac_reward_zero_std": 0.25, "grad_norm": 0.3113265563485327, "kl": 0.580078125, "learning_rate": 1.9535570851447166e-05, "loss": 0.0017, "num_tokens": 68670791.0, "reward": 0.90869140625, "reward_std": 0.2443283200263977, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.96875, "rewards/format_reward/std": 0.17433346807956696, "rewards/tag_count_reward/mean": 0.98828125, "rewards/tag_count_reward/std": 0.07579238712787628, "step": 481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 867.0, "completions/max_terminated_length": 867.0, "completions/mean_length": 363.55078125, "completions/mean_terminated_length": 363.55078125, "completions/min_length": 130.0, "completions/min_terminated_length": 130.0, "epoch": 0.18858512251185994, "frac_reward_zero_std": 0.25, "grad_norm": 0.32061889901274127, "kl": 0.58740234375, "learning_rate": 1.953144766814161e-05, "loss": 0.0195, "num_tokens": 68793332.0, "reward": 0.94921875, "reward_std": 0.29337677359580994, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 782.0, "completions/max_terminated_length": 782.0, "completions/mean_length": 336.73046875, "completions/mean_terminated_length": 336.73046875, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.1889763779527559, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3505850216781554, "kl": 0.58544921875, "learning_rate": 1.9527306701924292e-05, "loss": 0.0453, "num_tokens": 68908975.0, "reward": 1.1328125, "reward_std": 0.276502788066864, "rewards/accuracy_reward/mean": 0.8828125, "rewards/accuracy_reward/std": 0.3222736418247223, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 826.0, "completions/max_terminated_length": 826.0, "completions/mean_length": 407.96484375, "completions/mean_terminated_length": 407.96484375, "completions/min_length": 133.0, "completions/min_terminated_length": 133.0, "epoch": 0.18936763339365187, "frac_reward_zero_std": 0.375, "grad_norm": 0.27450255934755674, "kl": 0.61181640625, "learning_rate": 1.952314796052105e-05, "loss": 0.0367, "num_tokens": 69043894.0, "reward": 0.7486572265625, "reward_std": 0.22396209836006165, "rewards/accuracy_reward/mean": 0.5, "rewards/accuracy_reward/std": 0.5009794235229492, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1296.0, "completions/max_terminated_length": 1296.0, "completions/mean_length": 426.37890625, "completions/mean_terminated_length": 426.37890625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.18975888883454786, "frac_reward_zero_std": 0.25, "grad_norm": 0.2942859334863411, "kl": 0.546875, "learning_rate": 1.9518971451690885e-05, "loss": 0.0385, "num_tokens": 69183975.0, "reward": 0.8704833984375, "reward_std": 0.2731557786464691, "rewards/accuracy_reward/mean": 0.62109375, "rewards/accuracy_reward/std": 0.4860650300979614, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 823.0, "completions/max_terminated_length": 823.0, "completions/mean_length": 362.98828125, "completions/mean_terminated_length": 362.98828125, "completions/min_length": 126.0, "completions/min_terminated_length": 126.0, "epoch": 0.19015014427544383, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3232427348356337, "kl": 0.583984375, "learning_rate": 1.951477718322595e-05, "loss": 0.0103, "num_tokens": 69306580.0, "reward": 0.95703125, "reward_std": 0.2679290771484375, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 388.48828125, "completions/mean_terminated_length": 388.48828125, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.1905413997163398, "frac_reward_zero_std": 0.25, "grad_norm": 0.47409750221837876, "kl": 0.5888671875, "learning_rate": 1.9510565162951538e-05, "loss": -0.0039, "num_tokens": 69437281.0, "reward": 0.9405517578125, "reward_std": 0.290461003780365, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1035.0, "completions/max_terminated_length": 1035.0, "completions/mean_length": 432.296875, "completions/mean_terminated_length": 432.296875, "completions/min_length": 138.0, "completions/min_terminated_length": 138.0, "epoch": 0.1909326551572358, "frac_reward_zero_std": 0.4375, "grad_norm": 0.2676754804543358, "kl": 0.5888671875, "learning_rate": 1.9506335398726048e-05, "loss": 0.0098, "num_tokens": 69577485.0, "reward": 0.8359375, "reward_std": 0.22075961530208588, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1375.0, "completions/max_terminated_length": 1375.0, "completions/mean_length": 419.75390625, "completions/mean_terminated_length": 419.75390625, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.19132391059813175, "frac_reward_zero_std": 0.25, "grad_norm": 0.31939197475167147, "kl": 0.5927734375, "learning_rate": 1.9502087898440988e-05, "loss": 0.0442, "num_tokens": 69716158.0, "reward": 0.947509765625, "reward_std": 0.26380011439323425, "rewards/accuracy_reward/mean": 0.69921875, "rewards/accuracy_reward/std": 0.45949608087539673, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 808.0, "completions/max_terminated_length": 808.0, "completions/mean_length": 381.0859375, "completions/mean_terminated_length": 381.0859375, "completions/min_length": 173.0, "completions/min_terminated_length": 173.0, "epoch": 0.19171516603902772, "frac_reward_zero_std": 0.3125, "grad_norm": 0.2761735596643201, "kl": 0.63916015625, "learning_rate": 1.9497822670020966e-05, "loss": 0.0047, "num_tokens": 69840276.0, "reward": 0.95947265625, "reward_std": 0.22576779127120972, "rewards/accuracy_reward/mean": 0.7109375, "rewards/accuracy_reward/std": 0.45421501994132996, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 906.0, "completions/max_terminated_length": 906.0, "completions/mean_length": 420.4921875, "completions/mean_terminated_length": 420.4921875, "completions/min_length": 167.0, "completions/min_terminated_length": 167.0, "epoch": 0.1921064214799237, "frac_reward_zero_std": 0.25, "grad_norm": 0.3248443796893699, "kl": 0.591796875, "learning_rate": 1.949353972142366e-05, "loss": 0.0175, "num_tokens": 69975682.0, "reward": 1.0135498046875, "reward_std": 0.23389016091823578, "rewards/accuracy_reward/mean": 0.765625, "rewards/accuracy_reward/std": 0.42443734407424927, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.04671131819486618, "step": 491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 964.0, "completions/max_terminated_length": 964.0, "completions/mean_length": 420.30859375, "completions/mean_terminated_length": 420.30859375, "completions/min_length": 161.0, "completions/min_terminated_length": 161.0, "epoch": 0.19249767692081968, "frac_reward_zero_std": 0.125, "grad_norm": 0.38896131401251777, "kl": 0.65771484375, "learning_rate": 1.94892390606398e-05, "loss": 0.0443, "num_tokens": 70113025.0, "reward": 0.784912109375, "reward_std": 0.30485638976097107, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.990234375, "rewards/tag_count_reward/std": 0.06569644808769226, "step": 492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 973.0, "completions/max_terminated_length": 973.0, "completions/mean_length": 423.3828125, "completions/mean_terminated_length": 423.3828125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.19288893236171564, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3359329446383006, "kl": 0.61474609375, "learning_rate": 1.9484920695693176e-05, "loss": 0.0433, "num_tokens": 70253891.0, "reward": 0.7916259765625, "reward_std": 0.366222083568573, "rewards/accuracy_reward/mean": 0.54296875, "rewards/accuracy_reward/std": 0.4991260766983032, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 816.0, "completions/max_terminated_length": 816.0, "completions/mean_length": 361.87890625, "completions/mean_terminated_length": 361.87890625, "completions/min_length": 131.0, "completions/min_terminated_length": 131.0, "epoch": 0.19328018780261164, "frac_reward_zero_std": 0.375, "grad_norm": 0.30971390623164585, "kl": 0.69775390625, "learning_rate": 1.94805846346406e-05, "loss": 0.0158, "num_tokens": 70374244.0, "reward": 0.9244384765625, "reward_std": 0.20503181219100952, "rewards/accuracy_reward/mean": 0.67578125, "rewards/accuracy_reward/std": 0.46899911761283875, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 737.0, "completions/max_terminated_length": 737.0, "completions/mean_length": 347.6796875, "completions/mean_terminated_length": 347.6796875, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.1936714432435076, "frac_reward_zero_std": 0.25, "grad_norm": 0.335677833759544, "kl": 0.68994140625, "learning_rate": 1.94762308855719e-05, "loss": 0.0087, "num_tokens": 70492258.0, "reward": 0.8134765625, "reward_std": 0.2832539677619934, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9921875, "rewards/tag_count_reward/std": 0.06213126704096794, "step": 495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 907.0, "completions/max_terminated_length": 907.0, "completions/mean_length": 354.60546875, "completions/mean_terminated_length": 354.60546875, "completions/min_length": 108.0, "completions/min_terminated_length": 108.0, "epoch": 0.19406269868440357, "frac_reward_zero_std": 0.1875, "grad_norm": 0.35754016959981266, "kl": 0.6669921875, "learning_rate": 1.947185945660991e-05, "loss": 0.0149, "num_tokens": 70612749.0, "reward": 0.9417724609375, "reward_std": 0.26696205139160156, "rewards/accuracy_reward/mean": 0.6953125, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9912109375, "rewards/tag_count_reward/std": 0.0639462023973465, "step": 496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 753.0, "completions/max_terminated_length": 753.0, "completions/mean_length": 369.29296875, "completions/mean_terminated_length": 369.29296875, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.19445395412529956, "frac_reward_zero_std": 0.125, "grad_norm": 0.35250474518910946, "kl": 0.63525390625, "learning_rate": 1.9467470355910438e-05, "loss": 0.0145, "num_tokens": 70737464.0, "reward": 0.8427734375, "reward_std": 0.341018944978714, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49209436774253845, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 705.0, "completions/max_terminated_length": 705.0, "completions/mean_length": 361.0859375, "completions/mean_terminated_length": 361.0859375, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.19484520956619553, "frac_reward_zero_std": 0.25, "grad_norm": 0.3596142047153135, "kl": 0.6708984375, "learning_rate": 1.9463063591662284e-05, "loss": 0.0264, "num_tokens": 70860862.0, "reward": 0.77197265625, "reward_std": 0.2818782329559326, "rewards/accuracy_reward/mean": 0.5234375, "rewards/accuracy_reward/std": 0.5004287362098694, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 766.0, "completions/max_terminated_length": 766.0, "completions/mean_length": 397.4765625, "completions/mean_terminated_length": 397.4765625, "completions/min_length": 163.0, "completions/min_terminated_length": 163.0, "epoch": 0.1952364650070915, "frac_reward_zero_std": 0.1875, "grad_norm": 0.33888810908038103, "kl": 0.6259765625, "learning_rate": 1.945863917208718e-05, "loss": 0.0161, "num_tokens": 70992232.0, "reward": 0.7520751953125, "reward_std": 0.30110833048820496, "rewards/accuracy_reward/mean": 0.50390625, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 928.0, "completions/max_terminated_length": 928.0, "completions/mean_length": 441.87890625, "completions/mean_terminated_length": 441.87890625, "completions/min_length": 170.0, "completions/min_terminated_length": 170.0, "epoch": 0.19562772044798749, "frac_reward_zero_std": 0.25, "grad_norm": 0.3164276666618122, "kl": 0.70166015625, "learning_rate": 1.9454197105439813e-05, "loss": 0.0107, "num_tokens": 71134921.0, "reward": 0.7449951171875, "reward_std": 0.23562279343605042, "rewards/accuracy_reward/mean": 0.49609375, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1473.0, "completions/max_terminated_length": 1473.0, "completions/mean_length": 562.45703125, "completions/mean_terminated_length": 562.45703125, "completions/min_length": 261.0, "completions/min_terminated_length": 261.0, "epoch": 0.19601897588888345, "frac_reward_zero_std": 0.0, "grad_norm": 0.32501823324135515, "kl": 0.64013671875, "learning_rate": 1.944973740000778e-05, "loss": 0.0019, "num_tokens": 71310494.0, "reward": 0.78759765625, "reward_std": 0.30586785078048706, "rewards/accuracy_reward/mean": 0.54296875, "rewards/accuracy_reward/std": 0.4991260766983032, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.97265625, "rewards/tag_count_reward/std": 0.09253709018230438, "step": 501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1466.0, "completions/max_terminated_length": 1466.0, "completions/mean_length": 587.0390625, "completions/mean_terminated_length": 587.0390625, "completions/min_length": 238.0, "completions/min_terminated_length": 238.0, "epoch": 0.19641023132977942, "frac_reward_zero_std": 0.0, "grad_norm": 0.30103845856345607, "kl": 0.69384765625, "learning_rate": 1.9445260064111608e-05, "loss": 0.0447, "num_tokens": 71492280.0, "reward": 0.8690185546875, "reward_std": 0.28866061568260193, "rewards/accuracy_reward/mean": 0.62890625, "rewards/accuracy_reward/std": 0.48404383659362793, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9365234375, "rewards/tag_count_reward/std": 0.13870558142662048, "step": 502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1980.0, "completions/max_terminated_length": 1980.0, "completions/mean_length": 641.671875, "completions/mean_terminated_length": 641.671875, "completions/min_length": 186.0, "completions/min_terminated_length": 186.0, "epoch": 0.1968014867706754, "frac_reward_zero_std": 0.0, "grad_norm": 0.31264346410930977, "kl": 0.64990234375, "learning_rate": 1.9440765106104693e-05, "loss": 0.1028, "num_tokens": 71684980.0, "reward": 0.8787841796875, "reward_std": 0.2753908038139343, "rewards/accuracy_reward/mean": 0.63671875, "rewards/accuracy_reward/std": 0.48188701272010803, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9404296875, "rewards/tag_count_reward/std": 0.12763386964797974, "step": 503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1606.0, "completions/max_terminated_length": 1606.0, "completions/mean_length": 652.76171875, "completions/mean_terminated_length": 652.76171875, "completions/min_length": 239.0, "completions/min_terminated_length": 239.0, "epoch": 0.19719274221157138, "frac_reward_zero_std": 0.25, "grad_norm": 0.24927305030933605, "kl": 0.57958984375, "learning_rate": 1.9436252534373328e-05, "loss": 0.0196, "num_tokens": 71880231.0, "reward": 0.6778564453125, "reward_std": 0.33135682344436646, "rewards/accuracy_reward/mean": 0.4296875, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1593.0, "completions/max_terminated_length": 1593.0, "completions/mean_length": 607.99609375, "completions/mean_terminated_length": 607.99609375, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.19758399765246734, "frac_reward_zero_std": 0.25, "grad_norm": 0.2603482743196677, "kl": 0.5986328125, "learning_rate": 1.9431722357336657e-05, "loss": 0.0036, "num_tokens": 72065606.0, "reward": 0.81396484375, "reward_std": 0.22619251906871796, "rewards/accuracy_reward/mean": 0.56640625, "rewards/accuracy_reward/std": 0.4965413510799408, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.044107433408498764, "step": 505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1144.0, "completions/max_terminated_length": 1144.0, "completions/mean_length": 612.83203125, "completions/mean_terminated_length": 612.83203125, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.19797525309336333, "frac_reward_zero_std": 0.375, "grad_norm": 0.22989954530537585, "kl": 0.611328125, "learning_rate": 1.9427174583446677e-05, "loss": 0.0114, "num_tokens": 72251307.0, "reward": 0.84765625, "reward_std": 0.23803657293319702, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.96875, "completions/max_length": 2048.0, "completions/max_terminated_length": 2010.0, "completions/mean_length": 773.19140625, "completions/mean_terminated_length": 752.9564208984375, "completions/min_length": 334.0, "completions/min_terminated_length": 334.0, "epoch": 0.1983665085342593, "frac_reward_zero_std": 0.125, "grad_norm": 0.24071640182568313, "kl": 0.55908203125, "learning_rate": 1.9422609221188208e-05, "loss": 0.0524, "num_tokens": 72479004.0, "reward": 0.76220703125, "reward_std": 0.30588579177856445, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.96484375, "rewards/tag_count_reward/std": 0.12221155315637589, "step": 507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9765625, "completions/max_length": 2048.0, "completions/max_terminated_length": 1943.0, "completions/mean_length": 806.08984375, "completions/mean_terminated_length": 791.3636474609375, "completions/min_length": 305.0, "completions/min_terminated_length": 305.0, "epoch": 0.19875776397515527, "frac_reward_zero_std": 0.0, "grad_norm": 0.2612779047195048, "kl": 0.52587890625, "learning_rate": 1.941802627907889e-05, "loss": 0.0636, "num_tokens": 72713619.0, "reward": 0.8126220703125, "reward_std": 0.3235282301902771, "rewards/accuracy_reward/mean": 0.5703125, "rewards/accuracy_reward/std": 0.4960011839866638, "rewards/format_reward/mean": 0.96484375, "rewards/format_reward/std": 0.18453538417816162, "rewards/tag_count_reward/mean": 0.9736328125, "rewards/tag_count_reward/std": 0.09920313209295273, "step": 508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1693.0, "completions/max_terminated_length": 1693.0, "completions/mean_length": 740.40625, "completions/mean_terminated_length": 740.40625, "completions/min_length": 274.0, "completions/min_terminated_length": 274.0, "epoch": 0.19914901941605126, "frac_reward_zero_std": 0.25, "grad_norm": 0.21965249903430187, "kl": 0.5390625, "learning_rate": 1.9413425765669166e-05, "loss": 0.0018, "num_tokens": 72932091.0, "reward": 0.939697265625, "reward_std": 0.2897465229034424, "rewards/accuracy_reward/mean": 0.69140625, "rewards/accuracy_reward/std": 0.46281787753105164, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 2041.0, "completions/max_terminated_length": 2041.0, "completions/mean_length": 782.4296875, "completions/mean_terminated_length": 782.4296875, "completions/min_length": 207.0, "completions/min_terminated_length": 207.0, "epoch": 0.19954027485694723, "frac_reward_zero_std": 0.4375, "grad_norm": 0.1825192229268043, "kl": 0.53125, "learning_rate": 1.9408807689542257e-05, "loss": 0.0316, "num_tokens": 73160617.0, "reward": 0.690673828125, "reward_std": 0.18843401968479156, "rewards/accuracy_reward/mean": 0.44140625, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1584.0, "completions/max_terminated_length": 1584.0, "completions/mean_length": 747.98828125, "completions/mean_terminated_length": 747.98828125, "completions/min_length": 323.0, "completions/min_terminated_length": 323.0, "epoch": 0.1999315302978432, "frac_reward_zero_std": 0.1875, "grad_norm": 0.2363596656735533, "kl": 0.55078125, "learning_rate": 1.9404172059314146e-05, "loss": 0.0515, "num_tokens": 73382150.0, "reward": 0.742431640625, "reward_std": 0.3111479580402374, "rewards/accuracy_reward/mean": 0.49609375, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.994140625, "rewards/tag_count_reward/std": 0.04388983175158501, "step": 511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1286.0, "completions/max_terminated_length": 1286.0, "completions/mean_length": 655.76953125, "completions/mean_terminated_length": 655.76953125, "completions/min_length": 272.0, "completions/min_terminated_length": 272.0, "epoch": 0.20032278573873918, "frac_reward_zero_std": 0.25, "grad_norm": 0.24826469283066055, "kl": 0.5966796875, "learning_rate": 1.9399518883633575e-05, "loss": 0.0161, "num_tokens": 73579291.0, "reward": 0.787841796875, "reward_std": 0.3008278012275696, "rewards/accuracy_reward/mean": 0.5390625, "rewards/accuracy_reward/std": 0.4994482398033142, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1579.0, "completions/max_terminated_length": 1579.0, "completions/mean_length": 754.93359375, "completions/mean_terminated_length": 754.93359375, "completions/min_length": 229.0, "completions/min_terminated_length": 229.0, "epoch": 0.20071404117963515, "frac_reward_zero_std": 0.3125, "grad_norm": 0.23149760140210035, "kl": 0.61279296875, "learning_rate": 1.939484817118202e-05, "loss": 0.0292, "num_tokens": 73802170.0, "reward": 0.553466796875, "reward_std": 0.315522700548172, "rewards/accuracy_reward/mean": 0.3046875, "rewards/accuracy_reward/std": 0.4611765742301941, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1441.0, "completions/max_terminated_length": 1441.0, "completions/mean_length": 677.55078125, "completions/mean_terminated_length": 677.55078125, "completions/min_length": 231.0, "completions/min_terminated_length": 231.0, "epoch": 0.20110529662053112, "frac_reward_zero_std": 0.125, "grad_norm": 0.25515722726888546, "kl": 0.59130859375, "learning_rate": 1.9390159930673667e-05, "loss": 0.0368, "num_tokens": 74005527.0, "reward": 0.8204345703125, "reward_std": 0.32211366295814514, "rewards/accuracy_reward/mean": 0.57421875, "rewards/accuracy_reward/std": 0.49542948603630066, "rewards/format_reward/mean": 0.9765625, "rewards/format_reward/std": 0.15158477425575256, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.0464647077023983, "step": 514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1450.0, "completions/max_terminated_length": 1450.0, "completions/mean_length": 586.03515625, "completions/mean_terminated_length": 586.03515625, "completions/min_length": 257.0, "completions/min_terminated_length": 257.0, "epoch": 0.2014965520614271, "frac_reward_zero_std": 0.125, "grad_norm": 0.2609411809632109, "kl": 0.61376953125, "learning_rate": 1.9385454170855416e-05, "loss": 0.0189, "num_tokens": 74184544.0, "reward": 0.807373046875, "reward_std": 0.3796541392803192, "rewards/accuracy_reward/mean": 0.55859375, "rewards/accuracy_reward/std": 0.4975275993347168, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1356.0, "completions/max_terminated_length": 1356.0, "completions/mean_length": 597.6953125, "completions/mean_terminated_length": 597.6953125, "completions/min_length": 275.0, "completions/min_terminated_length": 275.0, "epoch": 0.20188780750232307, "frac_reward_zero_std": 0.1875, "grad_norm": 0.25908204825659875, "kl": 0.59033203125, "learning_rate": 1.9380730900506846e-05, "loss": 0.0414, "num_tokens": 74368242.0, "reward": 0.7652587890625, "reward_std": 0.2616797685623169, "rewards/accuracy_reward/mean": 0.51953125, "rewards/accuracy_reward/std": 0.5005971193313599, "rewards/format_reward/mean": 0.97265625, "rewards/format_reward/std": 0.1634024828672409, "rewards/tag_count_reward/mean": 0.9931640625, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 957.0, "completions/max_terminated_length": 957.0, "completions/mean_length": 463.0546875, "completions/mean_terminated_length": 463.0546875, "completions/min_length": 220.0, "completions/min_terminated_length": 220.0, "epoch": 0.20227906294321907, "frac_reward_zero_std": 0.1875, "grad_norm": 0.28837245325631927, "kl": 0.7109375, "learning_rate": 1.9375990128440205e-05, "loss": 0.0147, "num_tokens": 74514336.0, "reward": 0.8953857421875, "reward_std": 0.25247257947921753, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1156.0, "completions/max_terminated_length": 1156.0, "completions/mean_length": 442.73828125, "completions/mean_terminated_length": 442.73828125, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.20267031838411503, "frac_reward_zero_std": 0.125, "grad_norm": 0.332375045035177, "kl": 0.72412109375, "learning_rate": 1.93712318635004e-05, "loss": 0.037, "num_tokens": 74658205.0, "reward": 0.7781982421875, "reward_std": 0.3241286277770996, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 732.0, "completions/max_terminated_length": 732.0, "completions/mean_length": 393.6484375, "completions/mean_terminated_length": 393.6484375, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.203061573825011, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3551149693018321, "kl": 0.70849609375, "learning_rate": 1.9366456114564965e-05, "loss": 0.0463, "num_tokens": 74788595.0, "reward": 0.8446044921875, "reward_std": 0.36739516258239746, "rewards/accuracy_reward/mean": 0.59765625, "rewards/accuracy_reward/std": 0.4913311004638672, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 718.0, "completions/max_terminated_length": 718.0, "completions/mean_length": 348.765625, "completions/mean_terminated_length": 348.765625, "completions/min_length": 162.0, "completions/min_terminated_length": 162.0, "epoch": 0.203452829265907, "frac_reward_zero_std": 0.25, "grad_norm": 0.3289066099536819, "kl": 0.7177734375, "learning_rate": 1.9361662890544068e-05, "loss": 0.0059, "num_tokens": 74907143.0, "reward": 0.7640380859375, "reward_std": 0.31119757890701294, "rewards/accuracy_reward/mean": 0.515625, "rewards/accuracy_reward/std": 0.5007347464561462, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.05169277638196945, "step": 520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 683.0, "completions/mean_length": 337.1640625, "completions/mean_terminated_length": 330.4549255371094, "completions/min_length": 147.0, "completions/min_terminated_length": 147.0, "epoch": 0.20384408470680296, "frac_reward_zero_std": 0.375, "grad_norm": 0.3545663183805474, "kl": 0.70361328125, "learning_rate": 1.9356852200380466e-05, "loss": 0.0776, "num_tokens": 75023345.0, "reward": 0.7540283203125, "reward_std": 0.20631703734397888, "rewards/accuracy_reward/mean": 0.5078125, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.9892578125, "rewards/tag_count_reward/std": 0.07753453403711319, "step": 521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 734.0, "completions/max_terminated_length": 734.0, "completions/mean_length": 305.14453125, "completions/mean_terminated_length": 305.14453125, "completions/min_length": 98.0, "completions/min_terminated_length": 98.0, "epoch": 0.20423534014769892, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3516356845442845, "kl": 0.7001953125, "learning_rate": 1.935202405304951e-05, "loss": 0.0206, "num_tokens": 75129718.0, "reward": 0.96630859375, "reward_std": 0.230906680226326, "rewards/accuracy_reward/mean": 0.71875, "rewards/accuracy_reward/std": 0.45048993825912476, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 770.0, "completions/max_terminated_length": 770.0, "completions/mean_length": 359.65234375, "completions/mean_terminated_length": 359.65234375, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.20462659558859492, "frac_reward_zero_std": 0.3125, "grad_norm": 0.30572198774497295, "kl": 0.6689453125, "learning_rate": 1.934717845755912e-05, "loss": 0.007, "num_tokens": 75250301.0, "reward": 0.740966796875, "reward_std": 0.26204198598861694, "rewards/accuracy_reward/mean": 0.4921875, "rewards/accuracy_reward/std": 0.5009182691574097, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 827.0, "completions/max_terminated_length": 827.0, "completions/mean_length": 358.16796875, "completions/mean_terminated_length": 358.16796875, "completions/min_length": 132.0, "completions/min_terminated_length": 132.0, "epoch": 0.20501785102949088, "frac_reward_zero_std": 0.25, "grad_norm": 0.32470989765836544, "kl": 0.6494140625, "learning_rate": 1.9342315422949772e-05, "loss": 0.0325, "num_tokens": 75370008.0, "reward": 0.8770751953125, "reward_std": 0.2717050313949585, "rewards/accuracy_reward/mean": 0.62890625, "rewards/accuracy_reward/std": 0.48404383659362793, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1024.0, "completions/max_terminated_length": 1024.0, "completions/mean_length": 362.5703125, "completions/mean_terminated_length": 362.5703125, "completions/min_length": 127.0, "completions/min_terminated_length": 127.0, "epoch": 0.20540910647038685, "frac_reward_zero_std": 0.375, "grad_norm": 0.32579821385736973, "kl": 0.62255859375, "learning_rate": 1.933743495829447e-05, "loss": 0.0378, "num_tokens": 75491754.0, "reward": 0.928466796875, "reward_std": 0.2317776381969452, "rewards/accuracy_reward/mean": 0.6796875, "rewards/accuracy_reward/std": 0.4675106406211853, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1253.0, "completions/max_terminated_length": 1253.0, "completions/mean_length": 360.90234375, "completions/mean_terminated_length": 360.90234375, "completions/min_length": 165.0, "completions/min_terminated_length": 165.0, "epoch": 0.20580036191128284, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3033471930985333, "kl": 0.610107421875, "learning_rate": 1.933253707269875e-05, "loss": 0.0412, "num_tokens": 75614849.0, "reward": 0.9095458984375, "reward_std": 0.21290194988250732, "rewards/accuracy_reward/mean": 0.66015625, "rewards/accuracy_reward/std": 0.47458380460739136, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 818.0, "completions/max_terminated_length": 818.0, "completions/mean_length": 374.265625, "completions/mean_terminated_length": 374.265625, "completions/min_length": 150.0, "completions/min_terminated_length": 150.0, "epoch": 0.2061916173521788, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3085867400836263, "kl": 0.58251953125, "learning_rate": 1.932762177530064e-05, "loss": 0.0286, "num_tokens": 75741061.0, "reward": 0.799560546875, "reward_std": 0.2959206700325012, "rewards/accuracy_reward/mean": 0.55078125, "rewards/accuracy_reward/std": 0.49838894605636597, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1314.0, "completions/max_terminated_length": 1314.0, "completions/mean_length": 413.51171875, "completions/mean_terminated_length": 413.51171875, "completions/min_length": 175.0, "completions/min_terminated_length": 175.0, "epoch": 0.20658287279307477, "frac_reward_zero_std": 0.5, "grad_norm": 0.2565245744834371, "kl": 0.54833984375, "learning_rate": 1.932268907527065e-05, "loss": 0.0277, "num_tokens": 75876648.0, "reward": 0.842529296875, "reward_std": 0.21062573790550232, "rewards/accuracy_reward/mean": 0.59375, "rewards/accuracy_reward/std": 0.49209436774253845, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 714.0, "completions/max_terminated_length": 714.0, "completions/mean_length": 379.33984375, "completions/mean_terminated_length": 379.33984375, "completions/min_length": 145.0, "completions/min_terminated_length": 145.0, "epoch": 0.20697412823397077, "frac_reward_zero_std": 0.25, "grad_norm": 0.3547562791099002, "kl": 0.6142578125, "learning_rate": 1.9317738981811776e-05, "loss": 0.0205, "num_tokens": 76003855.0, "reward": 1.003173828125, "reward_std": 0.28776174783706665, "rewards/accuracy_reward/mean": 0.75390625, "rewards/accuracy_reward/std": 0.43157756328582764, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.03125, "step": 529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1022.0, "completions/max_terminated_length": 1022.0, "completions/mean_length": 408.59375, "completions/mean_terminated_length": 408.59375, "completions/min_length": 148.0, "completions/min_terminated_length": 148.0, "epoch": 0.20736538367486673, "frac_reward_zero_std": 0.375, "grad_norm": 0.3182845276662991, "kl": 0.5712890625, "learning_rate": 1.9312771504159448e-05, "loss": 0.0186, "num_tokens": 76138967.0, "reward": 0.8724365234375, "reward_std": 0.20210450887680054, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.984375, "rewards/format_reward/std": 0.12426253408193588, "rewards/tag_count_reward/mean": 0.9951171875, "rewards/tag_count_reward/std": 0.041130900382995605, "step": 530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 777.0, "completions/max_terminated_length": 777.0, "completions/mean_length": 362.171875, "completions/mean_terminated_length": 362.171875, "completions/min_length": 123.0, "completions/min_terminated_length": 123.0, "epoch": 0.2077566391157627, "frac_reward_zero_std": 0.3125, "grad_norm": 0.314680240940402, "kl": 0.5859375, "learning_rate": 1.930778665158154e-05, "loss": 0.031, "num_tokens": 76260851.0, "reward": 1.0528564453125, "reward_std": 0.2822665572166443, "rewards/accuracy_reward/mean": 0.8046875, "rewards/accuracy_reward/std": 0.39721766114234924, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 989.0, "completions/max_terminated_length": 989.0, "completions/mean_length": 385.140625, "completions/mean_terminated_length": 385.140625, "completions/min_length": 141.0, "completions/min_terminated_length": 141.0, "epoch": 0.2081478945566587, "frac_reward_zero_std": 0.25, "grad_norm": 0.3251558708482849, "kl": 0.6328125, "learning_rate": 1.9302784433378333e-05, "loss": 0.0084, "num_tokens": 76389431.0, "reward": 0.9375, "reward_std": 0.28343653678894043, "rewards/accuracy_reward/mean": 0.6875, "rewards/accuracy_reward/std": 0.4644203782081604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1404.0, "completions/max_terminated_length": 1404.0, "completions/mean_length": 375.87890625, "completions/mean_terminated_length": 375.87890625, "completions/min_length": 135.0, "completions/min_terminated_length": 135.0, "epoch": 0.20853914999755466, "frac_reward_zero_std": 0.375, "grad_norm": 0.29121906641655626, "kl": 0.615234375, "learning_rate": 1.9297764858882516e-05, "loss": 0.019, "num_tokens": 76513112.0, "reward": 0.9200439453125, "reward_std": 0.1917218714952469, "rewards/accuracy_reward/mean": 0.671875, "rewards/accuracy_reward/std": 0.47045037150382996, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1547.0, "completions/max_terminated_length": 1547.0, "completions/mean_length": 514.890625, "completions/mean_terminated_length": 514.890625, "completions/min_length": 189.0, "completions/min_terminated_length": 189.0, "epoch": 0.20893040543845062, "frac_reward_zero_std": 0.125, "grad_norm": 0.2922945394143476, "kl": 0.548828125, "learning_rate": 1.9292727937459155e-05, "loss": 0.0235, "num_tokens": 76674876.0, "reward": 0.8118896484375, "reward_std": 0.32928287982940674, "rewards/accuracy_reward/mean": 0.5625, "rewards/accuracy_reward/std": 0.49705013632774353, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 934.0, "completions/max_terminated_length": 934.0, "completions/mean_length": 376.57421875, "completions/mean_terminated_length": 376.57421875, "completions/min_length": 168.0, "completions/min_terminated_length": 168.0, "epoch": 0.20932166087934662, "frac_reward_zero_std": 0.4375, "grad_norm": 0.3328523622997377, "kl": 0.7509765625, "learning_rate": 1.9287673678505684e-05, "loss": 0.0119, "num_tokens": 76800031.0, "reward": 0.95654296875, "reward_std": 0.2092324197292328, "rewards/accuracy_reward/mean": 0.70703125, "rewards/accuracy_reward/std": 0.45601576566696167, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1578.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 440.83984375, "completions/mean_terminated_length": 440.83984375, "completions/min_length": 180.0, "completions/min_terminated_length": 180.0, "epoch": 0.20971291632024258, "frac_reward_zero_std": 0.3125, "grad_norm": 0.3191900541189801, "kl": 0.65283203125, "learning_rate": 1.928260209145188e-05, "loss": 0.0197, "num_tokens": 76939286.0, "reward": 0.834716796875, "reward_std": 0.27715057134628296, "rewards/accuracy_reward/mean": 0.5859375, "rewards/accuracy_reward/std": 0.4935242533683777, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 813.0, "completions/max_terminated_length": 813.0, "completions/mean_length": 382.18359375, "completions/mean_terminated_length": 382.18359375, "completions/min_length": 129.0, "completions/min_terminated_length": 129.0, "epoch": 0.21010417176113855, "frac_reward_zero_std": 0.4375, "grad_norm": 0.34442991301406606, "kl": 0.74609375, "learning_rate": 1.9277513185759847e-05, "loss": 0.0233, "num_tokens": 77066469.0, "reward": 1.0462646484375, "reward_std": 0.2123897522687912, "rewards/accuracy_reward/mean": 0.796875, "rewards/accuracy_reward/std": 0.40311288833618164, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1267.0, "completions/max_terminated_length": 1267.0, "completions/mean_length": 476.75, "completions/mean_terminated_length": 476.75, "completions/min_length": 181.0, "completions/min_terminated_length": 181.0, "epoch": 0.21049542720203454, "frac_reward_zero_std": 0.0625, "grad_norm": 0.3837377615389443, "kl": 0.6572265625, "learning_rate": 1.9272406970924013e-05, "loss": 0.0475, "num_tokens": 77216197.0, "reward": 0.779541015625, "reward_std": 0.39261358976364136, "rewards/accuracy_reward/mean": 0.53125, "rewards/accuracy_reward/std": 0.5, "rewards/format_reward/mean": 0.98828125, "rewards/format_reward/std": 0.1078278198838234, "rewards/tag_count_reward/mean": 0.998046875, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 970.0, "completions/max_terminated_length": 970.0, "completions/mean_length": 481.8125, "completions/mean_terminated_length": 481.8125, "completions/min_length": 242.0, "completions/min_terminated_length": 242.0, "epoch": 0.2108866826429305, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3577872581920302, "kl": 0.70068359375, "learning_rate": 1.926728345647108e-05, "loss": 0.0253, "num_tokens": 77371029.0, "reward": 0.8314208984375, "reward_std": 0.3402002155780792, "rewards/accuracy_reward/mean": 0.58203125, "rewards/accuracy_reward/std": 0.49419113993644714, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1189.0, "completions/max_terminated_length": 1189.0, "completions/mean_length": 506.30859375, "completions/mean_terminated_length": 506.30859375, "completions/min_length": 201.0, "completions/min_terminated_length": 201.0, "epoch": 0.21127793808382647, "frac_reward_zero_std": 0.25, "grad_norm": 0.3054033218165024, "kl": 0.5947265625, "learning_rate": 1.9262142651960048e-05, "loss": 0.0652, "num_tokens": 77531716.0, "reward": 0.91796875, "reward_std": 0.3244662880897522, "rewards/accuracy_reward/mean": 0.66796875, "rewards/accuracy_reward/std": 0.4718646705150604, "rewards/format_reward/mean": 1.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1130.0, "completions/max_terminated_length": 1130.0, "completions/mean_length": 522.1171875, "completions/mean_terminated_length": 522.1171875, "completions/min_length": 218.0, "completions/min_terminated_length": 218.0, "epoch": 0.21166919352472247, "frac_reward_zero_std": 0.125, "grad_norm": 0.36046149910611497, "kl": 0.64306640625, "learning_rate": 1.925698456698216e-05, "loss": 0.0206, "num_tokens": 77694114.0, "reward": 0.8626708984375, "reward_std": 0.3449711799621582, "rewards/accuracy_reward/mean": 0.61328125, "rewards/accuracy_reward/std": 0.4879522919654846, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 996.0, "completions/max_terminated_length": 996.0, "completions/mean_length": 486.3515625, "completions/mean_terminated_length": 486.3515625, "completions/min_length": 213.0, "completions/min_terminated_length": 213.0, "epoch": 0.21206044896561843, "frac_reward_zero_std": 0.25, "grad_norm": 0.3409238778377075, "kl": 0.6650390625, "learning_rate": 1.9251809211160905e-05, "loss": 0.0154, "num_tokens": 77849212.0, "reward": 0.89794921875, "reward_std": 0.3126068115234375, "rewards/accuracy_reward/mean": 0.6484375, "rewards/accuracy_reward/std": 0.47839346528053284, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 1.0, "rewards/tag_count_reward/std": 0.0, "step": 542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1064.0, "completions/max_terminated_length": 1064.0, "completions/mean_length": 487.65234375, "completions/mean_terminated_length": 487.65234375, "completions/min_length": 237.0, "completions/min_terminated_length": 237.0, "epoch": 0.2124517044065144, "frac_reward_zero_std": 0.375, "grad_norm": 0.4499338494993294, "kl": 0.7841796875, "learning_rate": 1.9246616594151986e-05, "loss": 0.0141, "num_tokens": 78003091.0, "reward": 0.7220458984375, "reward_std": 0.25242865085601807, "rewards/accuracy_reward/mean": 0.47265625, "rewards/accuracy_reward/std": 0.5002297759056091, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 825.0, "completions/max_terminated_length": 825.0, "completions/mean_length": 456.4375, "completions/mean_terminated_length": 456.4375, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.2128429598474104, "frac_reward_zero_std": 0.5, "grad_norm": 0.34630112014633313, "kl": 0.634765625, "learning_rate": 1.924140672564333e-05, "loss": 0.0055, "num_tokens": 78149315.0, "reward": 0.8314208984375, "reward_std": 0.17552721500396729, "rewards/accuracy_reward/mean": 0.58203125, "rewards/accuracy_reward/std": 0.49419113993644714, "rewards/format_reward/mean": 0.99609375, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 1012.0, "completions/max_terminated_length": 1012.0, "completions/mean_length": 453.80078125, "completions/mean_terminated_length": 453.80078125, "completions/min_length": 174.0, "completions/min_terminated_length": 174.0, "epoch": 0.21323421528830636, "frac_reward_zero_std": 0.1875, "grad_norm": 0.3679724986928, "kl": 0.7060546875, "learning_rate": 1.9236179615355026e-05, "loss": 0.0146, "num_tokens": 78293872.0, "reward": 0.8017578125, "reward_std": 0.3176271617412567, "rewards/accuracy_reward/mean": 0.5546875, "rewards/accuracy_reward/std": 0.49797385931015015, "rewards/format_reward/mean": 0.98046875, "rewards/format_reward/std": 0.13865381479263306, "rewards/tag_count_reward/mean": 0.99609375, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.6953125, "completions/max_length": 2048.0, "completions/max_terminated_length": 963.0, "completions/mean_length": 680.203125, "completions/mean_terminated_length": 434.37786865234375, "completions/min_length": 188.0, "completions/min_terminated_length": 188.0, "epoch": 0.21362547072920232, "frac_reward_zero_std": 0.0, "grad_norm": 2.2394167296829663, "kl": 0.8349609375, "learning_rate": 1.923093527303935e-05, "loss": 0.3734, "num_tokens": 78496996.0, "reward": 0.66748046875, "reward_std": 0.3245270252227783, "rewards/accuracy_reward/mean": 0.4453125, "rewards/accuracy_reward/std": 0.49797385931015015, "rewards/format_reward/mean": 0.828125, "rewards/format_reward/std": 0.3780108094215393, "rewards/tag_count_reward/mean": 0.94921875, "rewards/tag_count_reward/std": 0.13045687973499298, "step": 546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21401672617009831, "frac_reward_zero_std": 0.9375, "grad_norm": 2262.045732154886, "kl": 80.6396484375, "learning_rate": 1.922567370848072e-05, "loss": 0.8079, "num_tokens": 79050852.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21440798161099428, "frac_reward_zero_std": 1.0, "grad_norm": 0.31666411555042034, "kl": 0.63330078125, "learning_rate": 1.922039493149568e-05, "loss": 0.0063, "num_tokens": 79604692.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 780.0, "completions/max_terminated_length": 780.0, "completions/mean_length": 379.359375, "completions/mean_terminated_length": 379.359375, "completions/min_length": 182.0, "completions/min_terminated_length": 182.0, "epoch": 0.21479923705189025, "frac_reward_zero_std": 0.25, "grad_norm": 0.3211312534328214, "kl": 0.8095703125, "learning_rate": 1.9215098951932905e-05, "loss": 0.0258, "num_tokens": 79729392.0, "reward": 0.9127197265625, "reward_std": 0.306039035320282, "rewards/accuracy_reward/mean": 0.6640625, "rewards/accuracy_reward/std": 0.4732423722743988, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9970703125, "rewards/tag_count_reward/std": 0.03488371521234512, "step": 549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21519049249278624, "frac_reward_zero_std": 1.0, "grad_norm": 48.723232227376954, "kl": 1.3291015625, "learning_rate": 1.9209785779673153e-05, "loss": 0.0133, "num_tokens": 80285168.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -1.0, "completions/max_length": 811.0, "completions/max_terminated_length": 811.0, "completions/mean_length": 367.75390625, "completions/mean_terminated_length": 367.75390625, "completions/min_length": 153.0, "completions/min_terminated_length": 153.0, "epoch": 0.2155817479336822, "frac_reward_zero_std": 0.25, "grad_norm": 0.31180724952597505, "kl": 0.763671875, "learning_rate": 1.920445542462925e-05, "loss": 0.0161, "num_tokens": 80408257.0, "reward": 0.8739013671875, "reward_std": 0.3085157573223114, "rewards/accuracy_reward/mean": 0.625, "rewards/accuracy_reward/std": 0.4850712716579437, "rewards/format_reward/mean": 0.9921875, "rewards/format_reward/std": 0.08821486681699753, "rewards/tag_count_reward/mean": 0.9990234375, "rewards/tag_count_reward/std": 0.015625, "step": 551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21597300337457817, "frac_reward_zero_std": 1.0, "grad_norm": 1.224244705451843, "kl": 0.78173828125, "learning_rate": 1.9199107896746093e-05, "loss": 0.0078, "num_tokens": 80962593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": -0.890625, "completions/max_length": 2048.0, "completions/max_terminated_length": 771.0, "completions/mean_length": 434.67578125, "completions/mean_terminated_length": 341.34295654296875, "completions/min_length": 159.0, "completions/min_terminated_length": 159.0, "epoch": 0.21636425881547416, "frac_reward_zero_std": 0.1875, "grad_norm": 3773182.156924175, "kl": 220.671875, "learning_rate": 1.9193743206000618e-05, "loss": 2.3808, "num_tokens": 81102046.0, "reward": 0.7349853515625, "reward_std": 0.2801627218723297, "rewards/accuracy_reward/mean": 0.49609375, "rewards/accuracy_reward/std": 0.5009641647338867, "rewards/format_reward/mean": 0.93359375, "rewards/format_reward/std": 0.24947863817214966, "rewards/tag_count_reward/mean": 0.9775390625, "rewards/tag_count_reward/std": 0.09768597036600113, "step": 553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21675551425637013, "frac_reward_zero_std": 1.0, "grad_norm": 2.291278301812527, "kl": 0.5244140625, "learning_rate": 1.9188361362401777e-05, "loss": 0.0052, "num_tokens": 81657550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2171467696972661, "frac_reward_zero_std": 1.0, "grad_norm": 4.405202205542311, "kl": 0.3814697265625, "learning_rate": 1.918296237599053e-05, "loss": 0.0038, "num_tokens": 82212974.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2175380251381621, "frac_reward_zero_std": 1.0, "grad_norm": 0.12193232257075293, "kl": 0.296630859375, "learning_rate": 1.9177546256839814e-05, "loss": 0.003, "num_tokens": 82766254.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21792928057905805, "frac_reward_zero_std": 1.0, "grad_norm": 0.08100978478194218, "kl": 0.142822265625, "learning_rate": 1.917211301505453e-05, "loss": 0.0014, "num_tokens": 83322270.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21832053601995402, "frac_reward_zero_std": 1.0, "grad_norm": 68.63928387836748, "kl": 3.3427734375, "learning_rate": 1.9166662660771537e-05, "loss": 0.0334, "num_tokens": 83877454.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21871179146085, "frac_reward_zero_std": 1.0, "grad_norm": 0.11376614537830342, "kl": 0.65771484375, "learning_rate": 1.9161195204159604e-05, "loss": 0.0066, "num_tokens": 84430446.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21910304690174598, "frac_reward_zero_std": 1.0, "grad_norm": 0.22046440920393445, "kl": 0.1842041015625, "learning_rate": 1.915571065541942e-05, "loss": 0.0018, "num_tokens": 84983726.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21949430234264194, "frac_reward_zero_std": 1.0, "grad_norm": 0.016047408942418878, "kl": 0.082275390625, "learning_rate": 1.9150209024783564e-05, "loss": 0.0008, "num_tokens": 85537086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.21988555778353794, "frac_reward_zero_std": 1.0, "grad_norm": 0.024661224681170907, "kl": 0.0772705078125, "learning_rate": 1.914469032251647e-05, "loss": 0.0008, "num_tokens": 86089406.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2202768132244339, "frac_reward_zero_std": 1.0, "grad_norm": 0.03360161585132386, "kl": 0.05889892578125, "learning_rate": 1.9139154558914442e-05, "loss": 0.0006, "num_tokens": 86643710.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22066806866532987, "frac_reward_zero_std": 1.0, "grad_norm": 0.020281964386933662, "kl": 0.057281494140625, "learning_rate": 1.91336017443056e-05, "loss": 0.0006, "num_tokens": 87196334.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22105932410622586, "frac_reward_zero_std": 1.0, "grad_norm": 0.010965385218539415, "kl": 0.0535888671875, "learning_rate": 1.9128031889049886e-05, "loss": 0.0005, "num_tokens": 87748862.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22145057954712183, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036278425549142123, "kl": 0.044158935546875, "learning_rate": 1.9122445003539027e-05, "loss": 0.0004, "num_tokens": 88301582.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2218418349880178, "frac_reward_zero_std": 1.0, "grad_norm": 34.137636870546665, "kl": 0.053497314453125, "learning_rate": 1.9116841098196538e-05, "loss": 0.0005, "num_tokens": 88855198.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2222330904289138, "frac_reward_zero_std": 1.0, "grad_norm": 52.412388844981336, "kl": 0.23443603515625, "learning_rate": 1.9111220183477666e-05, "loss": 0.0023, "num_tokens": 89410670.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22262434586980975, "frac_reward_zero_std": 1.0, "grad_norm": 17185.897683720388, "kl": 33.240447998046875, "learning_rate": 1.9105582269869413e-05, "loss": 0.3327, "num_tokens": 89965438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22301560131070572, "frac_reward_zero_std": 1.0, "grad_norm": 0.013571019659245148, "kl": 0.05096435546875, "learning_rate": 1.9099927367890482e-05, "loss": 0.0005, "num_tokens": 90517214.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2234068567516017, "frac_reward_zero_std": 1.0, "grad_norm": 0.006511793453222357, "kl": 0.056304931640625, "learning_rate": 1.9094255488091282e-05, "loss": 0.0006, "num_tokens": 91070462.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22379811219249768, "frac_reward_zero_std": 1.0, "grad_norm": 0.004174307762894411, "kl": 0.055633544921875, "learning_rate": 1.9088566641053887e-05, "loss": 0.0006, "num_tokens": 91622190.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22418936763339364, "frac_reward_zero_std": 1.0, "grad_norm": 0.005012548837321789, "kl": 0.051727294921875, "learning_rate": 1.908286083739204e-05, "loss": 0.0005, "num_tokens": 92176894.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22458062307428964, "frac_reward_zero_std": 1.0, "grad_norm": 0.2800281387988427, "kl": 0.051910400390625, "learning_rate": 1.9077138087751104e-05, "loss": 0.0005, "num_tokens": 92729310.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2249718785151856, "frac_reward_zero_std": 1.0, "grad_norm": 0.005339553269448441, "kl": 0.051239013671875, "learning_rate": 1.9071398402808076e-05, "loss": 0.0005, "num_tokens": 93283806.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22536313395608157, "frac_reward_zero_std": 1.0, "grad_norm": 0.014406125615319234, "kl": 0.038482666015625, "learning_rate": 1.9065641793271534e-05, "loss": 0.0004, "num_tokens": 93839246.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22575438939697756, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016524672744512132, "kl": 0.0288238525390625, "learning_rate": 1.9059868269881637e-05, "loss": 0.0003, "num_tokens": 94393662.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22614564483787353, "frac_reward_zero_std": 1.0, "grad_norm": 0.012337662742672312, "kl": 0.02935791015625, "learning_rate": 1.9054077843410108e-05, "loss": 0.0003, "num_tokens": 94946606.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2265369002787695, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008264856248422916, "kl": 0.022247314453125, "learning_rate": 1.9048270524660197e-05, "loss": 0.0002, "num_tokens": 95499710.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2269281557196655, "frac_reward_zero_std": 1.0, "grad_norm": 0.003330406687355578, "kl": 0.0200958251953125, "learning_rate": 1.9042446324466675e-05, "loss": 0.0002, "num_tokens": 96053358.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22731941116056145, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025947544748098966, "kl": 0.0187225341796875, "learning_rate": 1.9036605253695804e-05, "loss": 0.0002, "num_tokens": 96609390.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22771066660145742, "frac_reward_zero_std": 1.0, "grad_norm": 0.002397928770849876, "kl": 0.02032470703125, "learning_rate": 1.903074732324533e-05, "loss": 0.0002, "num_tokens": 97163934.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2281019220423534, "frac_reward_zero_std": 1.0, "grad_norm": 0.003022543129027659, "kl": 0.0211181640625, "learning_rate": 1.9024872544044442e-05, "loss": 0.0002, "num_tokens": 97717102.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22849317748324938, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003637671579876899, "kl": 0.016448974609375, "learning_rate": 1.9018980927053777e-05, "loss": 0.0002, "num_tokens": 98270574.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22888443292414534, "frac_reward_zero_std": 1.0, "grad_norm": 0.001512168535254755, "kl": 0.0254364013671875, "learning_rate": 1.9013072483265377e-05, "loss": 0.0003, "num_tokens": 98824574.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.22927568836504134, "frac_reward_zero_std": 1.0, "grad_norm": 0.03039933686377209, "kl": 0.0302734375, "learning_rate": 1.9007147223702687e-05, "loss": 0.0003, "num_tokens": 99378302.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2296669438059373, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015243500086005316, "kl": 0.0312652587890625, "learning_rate": 1.9001205159420512e-05, "loss": 0.0003, "num_tokens": 99931982.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23005819924683327, "frac_reward_zero_std": 1.0, "grad_norm": 0.003875590396528243, "kl": 0.03216552734375, "learning_rate": 1.8995246301505023e-05, "loss": 0.0003, "num_tokens": 100487566.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23044945468772926, "frac_reward_zero_std": 1.0, "grad_norm": 2.975129884950075, "kl": 0.0305938720703125, "learning_rate": 1.898927066107371e-05, "loss": 0.0003, "num_tokens": 101041966.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23084071012862523, "frac_reward_zero_std": 1.0, "grad_norm": 0.02904487097609368, "kl": 0.019927978515625, "learning_rate": 1.8983278249275388e-05, "loss": 0.0002, "num_tokens": 101597102.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2312319655695212, "frac_reward_zero_std": 1.0, "grad_norm": 49727.88034438709, "kl": 31.649673461914062, "learning_rate": 1.8977269077290158e-05, "loss": 0.3167, "num_tokens": 102151150.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23162322101041719, "frac_reward_zero_std": 1.0, "grad_norm": 0.003261720226093756, "kl": 0.032928466796875, "learning_rate": 1.897124315632938e-05, "loss": 0.0003, "num_tokens": 102705326.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23201447645131315, "frac_reward_zero_std": 1.0, "grad_norm": 0.0057521929083590995, "kl": 0.03631591796875, "learning_rate": 1.896520049763568e-05, "loss": 0.0004, "num_tokens": 103260926.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23240573189220912, "frac_reward_zero_std": 1.0, "grad_norm": 0.004850582732575166, "kl": 0.035675048828125, "learning_rate": 1.89591411124829e-05, "loss": 0.0004, "num_tokens": 103813022.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2327969873331051, "frac_reward_zero_std": 1.0, "grad_norm": 0.004001617871099224, "kl": 0.0394287109375, "learning_rate": 1.8953065012176096e-05, "loss": 0.0004, "num_tokens": 104367886.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23318824277400108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014551898987958157, "kl": 0.0362548828125, "learning_rate": 1.89469722080515e-05, "loss": 0.0004, "num_tokens": 104922158.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23357949821489704, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029683470605861876, "kl": 0.03668212890625, "learning_rate": 1.8940862711476515e-05, "loss": 0.0004, "num_tokens": 105477838.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23397075365579303, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030920857197187274, "kl": 0.0339508056640625, "learning_rate": 1.8934736533849686e-05, "loss": 0.0003, "num_tokens": 106032782.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.234362009096689, "frac_reward_zero_std": 1.0, "grad_norm": 0.008745044748541998, "kl": 0.0306549072265625, "learning_rate": 1.8928593686600683e-05, "loss": 0.0003, "num_tokens": 106587374.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23475326453758497, "frac_reward_zero_std": 1.0, "grad_norm": 0.003741231753767604, "kl": 0.030029296875, "learning_rate": 1.892243418119027e-05, "loss": 0.0003, "num_tokens": 107141854.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23514451997848096, "frac_reward_zero_std": 1.0, "grad_norm": 0.010897996283029524, "kl": 0.0260467529296875, "learning_rate": 1.8916258029110305e-05, "loss": 0.0003, "num_tokens": 107696094.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23553577541937692, "frac_reward_zero_std": 1.0, "grad_norm": 0.0063770639654334, "kl": 0.0263824462890625, "learning_rate": 1.891006524188368e-05, "loss": 0.0003, "num_tokens": 108249486.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2359270308602729, "frac_reward_zero_std": 1.0, "grad_norm": 0.008169437892753366, "kl": 0.023406982421875, "learning_rate": 1.890385583106434e-05, "loss": 0.0002, "num_tokens": 108803086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23631828630116888, "frac_reward_zero_std": 1.0, "grad_norm": 0.0052803038845515, "kl": 0.0235443115234375, "learning_rate": 1.889762980823725e-05, "loss": 0.0002, "num_tokens": 109355918.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23670954174206485, "frac_reward_zero_std": 1.0, "grad_norm": 0.006040871170524045, "kl": 0.021240234375, "learning_rate": 1.8891387185018346e-05, "loss": 0.0002, "num_tokens": 109909550.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23710079718296082, "frac_reward_zero_std": 1.0, "grad_norm": 0.006246590580840887, "kl": 0.0212249755859375, "learning_rate": 1.8885127973054557e-05, "loss": 0.0002, "num_tokens": 110465086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2374920526238568, "frac_reward_zero_std": 1.0, "grad_norm": 0.0024639857129535735, "kl": 0.0260162353515625, "learning_rate": 1.8878852184023754e-05, "loss": 0.0003, "num_tokens": 111018670.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23788330806475277, "frac_reward_zero_std": 1.0, "grad_norm": 0.005890064300284762, "kl": 0.02252197265625, "learning_rate": 1.8872559829634732e-05, "loss": 0.0002, "num_tokens": 111572062.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23827456350564874, "frac_reward_zero_std": 1.0, "grad_norm": 0.00446425101623246, "kl": 0.019775390625, "learning_rate": 1.88662509216272e-05, "loss": 0.0002, "num_tokens": 112126846.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23866581894654473, "frac_reward_zero_std": 1.0, "grad_norm": 0.0033029098076310043, "kl": 0.024993896484375, "learning_rate": 1.885992547177174e-05, "loss": 0.0002, "num_tokens": 112681694.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2390570743874407, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010311957027477566, "kl": 0.0285186767578125, "learning_rate": 1.885358349186982e-05, "loss": 0.0003, "num_tokens": 113235438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23944832982833666, "frac_reward_zero_std": 1.0, "grad_norm": 0.004930404906464744, "kl": 0.0295562744140625, "learning_rate": 1.884722499375371e-05, "loss": 0.0003, "num_tokens": 113788206.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.23983958526923266, "frac_reward_zero_std": 1.0, "grad_norm": 0.001954870129672803, "kl": 0.03082275390625, "learning_rate": 1.8840849989286535e-05, "loss": 0.0003, "num_tokens": 114342430.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24023084071012862, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025210854334399343, "kl": 0.0341033935546875, "learning_rate": 1.8834458490362192e-05, "loss": 0.0003, "num_tokens": 114895406.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2406220961510246, "frac_reward_zero_std": 1.0, "grad_norm": 0.017947534685228105, "kl": 0.0297393798828125, "learning_rate": 1.8828050508905367e-05, "loss": 0.0003, "num_tokens": 115449966.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24101335159192058, "frac_reward_zero_std": 1.0, "grad_norm": 0.0025873943786940065, "kl": 0.0292205810546875, "learning_rate": 1.8821626056871487e-05, "loss": 0.0003, "num_tokens": 116005150.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24140460703281655, "frac_reward_zero_std": 1.0, "grad_norm": 0.036401754367597954, "kl": 0.035858154296875, "learning_rate": 1.8815185146246718e-05, "loss": 0.0004, "num_tokens": 116559326.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24179586247371251, "frac_reward_zero_std": 1.0, "grad_norm": 0.006652637755284798, "kl": 0.0286865234375, "learning_rate": 1.8808727789047923e-05, "loss": 0.0003, "num_tokens": 117114654.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2421871179146085, "frac_reward_zero_std": 1.0, "grad_norm": 0.006119717081458333, "kl": 0.032623291015625, "learning_rate": 1.8802253997322656e-05, "loss": 0.0003, "num_tokens": 117668302.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24257837335550447, "frac_reward_zero_std": 1.0, "grad_norm": 0.016990632695088524, "kl": 0.03857421875, "learning_rate": 1.8795763783149132e-05, "loss": 0.0004, "num_tokens": 118221454.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24296962879640044, "frac_reward_zero_std": 1.0, "grad_norm": 0.11746223662786723, "kl": 0.03375244140625, "learning_rate": 1.8789257158636202e-05, "loss": 0.0003, "num_tokens": 118775198.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24336088423729643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011823771018919717, "kl": 0.0350494384765625, "learning_rate": 1.878273413592334e-05, "loss": 0.0004, "num_tokens": 119332558.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2437521396781924, "frac_reward_zero_std": 1.0, "grad_norm": 0.0034035287445415394, "kl": 0.03900146484375, "learning_rate": 1.877619472718061e-05, "loss": 0.0004, "num_tokens": 119887470.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24414339511908836, "frac_reward_zero_std": 1.0, "grad_norm": 0.04003189373953313, "kl": 0.0484619140625, "learning_rate": 1.8769638944608647e-05, "loss": 0.0005, "num_tokens": 120442254.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24453465055998436, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021203175791926236, "kl": 0.042510986328125, "learning_rate": 1.8763066800438638e-05, "loss": 0.0004, "num_tokens": 120996430.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24492590600088032, "frac_reward_zero_std": 1.0, "grad_norm": 0.004873625641375505, "kl": 0.04638671875, "learning_rate": 1.8756478306932294e-05, "loss": 0.0005, "num_tokens": 121548958.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2453171614417763, "frac_reward_zero_std": 1.0, "grad_norm": 0.002132273184746582, "kl": 0.049072265625, "learning_rate": 1.8749873476381827e-05, "loss": 0.0005, "num_tokens": 122101214.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24570841688267228, "frac_reward_zero_std": 1.0, "grad_norm": 0.00239036193361179, "kl": 0.0499267578125, "learning_rate": 1.8743252321109935e-05, "loss": 0.0005, "num_tokens": 122654430.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24609967232356825, "frac_reward_zero_std": 1.0, "grad_norm": 0.06025341236173664, "kl": 0.05462646484375, "learning_rate": 1.873661485346977e-05, "loss": 0.0005, "num_tokens": 123206142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2464909277644642, "frac_reward_zero_std": 1.0, "grad_norm": 0.012155963080812642, "kl": 0.05596923828125, "learning_rate": 1.8729961085844914e-05, "loss": 0.0006, "num_tokens": 123758990.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2468821832053602, "frac_reward_zero_std": 1.0, "grad_norm": 0.004462543773896214, "kl": 0.05560302734375, "learning_rate": 1.8723291030649368e-05, "loss": 0.0006, "num_tokens": 124313470.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24727343864625617, "frac_reward_zero_std": 1.0, "grad_norm": 0.004133847330667126, "kl": 0.059814453125, "learning_rate": 1.8716604700327516e-05, "loss": 0.0006, "num_tokens": 124867470.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24766469408715214, "frac_reward_zero_std": 1.0, "grad_norm": 0.004260465036864082, "kl": 0.06146240234375, "learning_rate": 1.87099021073541e-05, "loss": 0.0006, "num_tokens": 125422670.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24805594952804813, "frac_reward_zero_std": 1.0, "grad_norm": 0.03782656659535248, "kl": 0.064483642578125, "learning_rate": 1.870318326423423e-05, "loss": 0.0006, "num_tokens": 125974366.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2484472049689441, "frac_reward_zero_std": 1.0, "grad_norm": 0.014663668425820328, "kl": 0.06671142578125, "learning_rate": 1.8696448183503292e-05, "loss": 0.0007, "num_tokens": 126528670.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24883846040984006, "frac_reward_zero_std": 1.0, "grad_norm": 22.46814479943985, "kl": 0.12554931640625, "learning_rate": 1.8689696877727006e-05, "loss": 0.0013, "num_tokens": 127082046.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24922971585073606, "frac_reward_zero_std": 1.0, "grad_norm": 0.002109009505084801, "kl": 0.04962158203125, "learning_rate": 1.8682929359501338e-05, "loss": 0.0005, "num_tokens": 127635934.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.24962097129163202, "frac_reward_zero_std": 1.0, "grad_norm": 0.014721965881855066, "kl": 0.04144287109375, "learning_rate": 1.8676145641452517e-05, "loss": 0.0004, "num_tokens": 128189710.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.250012226732528, "frac_reward_zero_std": 1.0, "grad_norm": 0.0029183749069356447, "kl": 0.03887939453125, "learning_rate": 1.8669345736236983e-05, "loss": 0.0004, "num_tokens": 128742158.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25040348217342395, "frac_reward_zero_std": 1.0, "grad_norm": 0.002832476452744704, "kl": 0.03143310546875, "learning_rate": 1.866252965654139e-05, "loss": 0.0003, "num_tokens": 129296126.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25079473761432, "frac_reward_zero_std": 1.0, "grad_norm": 0.007177655333394468, "kl": 0.0290985107421875, "learning_rate": 1.8655697415082556e-05, "loss": 0.0003, "num_tokens": 129849070.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25118599305521594, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014457408311877103, "kl": 0.024078369140625, "learning_rate": 1.864884902460746e-05, "loss": 0.0002, "num_tokens": 130403646.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2515772484961119, "frac_reward_zero_std": 1.0, "grad_norm": 0.00390975275712181, "kl": 0.02044677734375, "learning_rate": 1.8641984497893212e-05, "loss": 0.0002, "num_tokens": 130958030.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25196850393700787, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012488347760967861, "kl": 0.0184478759765625, "learning_rate": 1.8635103847747022e-05, "loss": 0.0002, "num_tokens": 131512590.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25235975937790384, "frac_reward_zero_std": 1.0, "grad_norm": 0.006268421356473371, "kl": 0.0194091796875, "learning_rate": 1.8628207087006186e-05, "loss": 0.0002, "num_tokens": 132064526.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2527510148187998, "frac_reward_zero_std": 1.0, "grad_norm": 0.004536328818846904, "kl": 0.016845703125, "learning_rate": 1.862129422853805e-05, "loss": 0.0002, "num_tokens": 132620798.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2531422702596958, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013174109609621284, "kl": 0.0166778564453125, "learning_rate": 1.8614365285240002e-05, "loss": 0.0002, "num_tokens": 133176046.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2535335257005918, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016307465000330746, "kl": 0.0174102783203125, "learning_rate": 1.860742027003944e-05, "loss": 0.0002, "num_tokens": 133729070.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25392478114148775, "frac_reward_zero_std": 1.0, "grad_norm": 0.003002114220401319, "kl": 0.0180511474609375, "learning_rate": 1.8600459195893737e-05, "loss": 0.0002, "num_tokens": 134282750.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2543160365823837, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017980568641742228, "kl": 0.017852783203125, "learning_rate": 1.8593482075790243e-05, "loss": 0.0002, "num_tokens": 134836702.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2547072920232797, "frac_reward_zero_std": 1.0, "grad_norm": 0.004050331744308951, "kl": 0.0162506103515625, "learning_rate": 1.8586488922746234e-05, "loss": 0.0002, "num_tokens": 135391230.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25509854746417565, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009428841996810199, "kl": 0.0180206298828125, "learning_rate": 1.8579479749808896e-05, "loss": 0.0002, "num_tokens": 135944430.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2554898029050717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0030305851519795263, "kl": 0.0173797607421875, "learning_rate": 1.857245457005532e-05, "loss": 0.0002, "num_tokens": 136498206.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25588105834596764, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019001271303059738, "kl": 0.0206146240234375, "learning_rate": 1.856541339659244e-05, "loss": 0.0002, "num_tokens": 137052062.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2562723137868636, "frac_reward_zero_std": 1.0, "grad_norm": 0.004244064210147885, "kl": 0.019073486328125, "learning_rate": 1.8558356242557044e-05, "loss": 0.0002, "num_tokens": 137605534.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25666356922775957, "frac_reward_zero_std": 1.0, "grad_norm": 0.006043842484081931, "kl": 0.0197906494140625, "learning_rate": 1.855128312111573e-05, "loss": 0.0002, "num_tokens": 138161086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25705482466865553, "frac_reward_zero_std": 1.0, "grad_norm": 0.0026715472186904917, "kl": 0.0199737548828125, "learning_rate": 1.8544194045464888e-05, "loss": 0.0002, "num_tokens": 138715662.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2574460801095515, "frac_reward_zero_std": 1.0, "grad_norm": 0.011010547327292137, "kl": 0.0216827392578125, "learning_rate": 1.8537089028830673e-05, "loss": 0.0002, "num_tokens": 139269598.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2578373355504475, "frac_reward_zero_std": 1.0, "grad_norm": 0.003086127478331508, "kl": 0.0224761962890625, "learning_rate": 1.852996808446898e-05, "loss": 0.0002, "num_tokens": 139822878.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2582285909913435, "frac_reward_zero_std": 1.0, "grad_norm": 0.13258446188353365, "kl": 0.0546417236328125, "learning_rate": 1.8522831225665423e-05, "loss": 0.0005, "num_tokens": 140374686.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25861984643223945, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008535124945819711, "kl": 0.0214385986328125, "learning_rate": 1.851567846573531e-05, "loss": 0.0002, "num_tokens": 140928366.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2590111018731354, "frac_reward_zero_std": 1.0, "grad_norm": 0.001147594799863453, "kl": 0.0218353271484375, "learning_rate": 1.850850981802361e-05, "loss": 0.0002, "num_tokens": 141483262.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2594023573140314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0023606036813130877, "kl": 0.0227508544921875, "learning_rate": 1.8501325295904933e-05, "loss": 0.0002, "num_tokens": 142037790.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.25979361275492735, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020416245114488808, "kl": 0.024322509765625, "learning_rate": 1.8494124912783516e-05, "loss": 0.0002, "num_tokens": 142593198.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26018486819582337, "frac_reward_zero_std": 1.0, "grad_norm": 0.04879182582901119, "kl": 0.0313720703125, "learning_rate": 1.8486908682093175e-05, "loss": 0.0003, "num_tokens": 143147822.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26057612363671934, "frac_reward_zero_std": 1.0, "grad_norm": 0.007489061490447796, "kl": 0.0295867919921875, "learning_rate": 1.8479676617297303e-05, "loss": 0.0003, "num_tokens": 143700654.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2609673790776153, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038708957664404923, "kl": 0.034210205078125, "learning_rate": 1.8472428731888836e-05, "loss": 0.0003, "num_tokens": 144254142.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26135863451851127, "frac_reward_zero_std": 1.0, "grad_norm": 0.006095190344426038, "kl": 0.03656005859375, "learning_rate": 1.8465165039390215e-05, "loss": 0.0004, "num_tokens": 144807982.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26174988995940723, "frac_reward_zero_std": 1.0, "grad_norm": 0.017301973356579046, "kl": 0.03753662109375, "learning_rate": 1.8457885553353384e-05, "loss": 0.0004, "num_tokens": 145362574.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2621411454003032, "frac_reward_zero_std": 1.0, "grad_norm": 0.018367671039557307, "kl": 0.0418701171875, "learning_rate": 1.8450590287359748e-05, "loss": 0.0004, "num_tokens": 145917246.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2625324008411992, "frac_reward_zero_std": 1.0, "grad_norm": 0.014656757048914896, "kl": 0.041412353515625, "learning_rate": 1.8443279255020153e-05, "loss": 0.0004, "num_tokens": 146472478.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2629236562820952, "frac_reward_zero_std": 1.0, "grad_norm": 0.011388703463023604, "kl": 0.04345703125, "learning_rate": 1.8435952469974858e-05, "loss": 0.0004, "num_tokens": 147025518.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26331491172299115, "frac_reward_zero_std": 1.0, "grad_norm": 0.010610621031747713, "kl": 0.0452880859375, "learning_rate": 1.842860994589352e-05, "loss": 0.0005, "num_tokens": 147579166.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2637061671638871, "frac_reward_zero_std": 1.0, "grad_norm": 0.008183832009915213, "kl": 0.0440673828125, "learning_rate": 1.842125169647515e-05, "loss": 0.0004, "num_tokens": 148132574.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2640974226047831, "frac_reward_zero_std": 1.0, "grad_norm": 0.006460544773506325, "kl": 0.042999267578125, "learning_rate": 1.841387773544811e-05, "loss": 0.0004, "num_tokens": 148686494.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26448867804567905, "frac_reward_zero_std": 1.0, "grad_norm": 0.025284946960866035, "kl": 0.042083740234375, "learning_rate": 1.8406488076570062e-05, "loss": 0.0004, "num_tokens": 149241422.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26487993348657507, "frac_reward_zero_std": 1.0, "grad_norm": 0.04892378329534301, "kl": 0.0423583984375, "learning_rate": 1.8399082733627967e-05, "loss": 0.0004, "num_tokens": 149795022.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26527118892747104, "frac_reward_zero_std": 1.0, "grad_norm": 0.003229424003461458, "kl": 0.044586181640625, "learning_rate": 1.8391661720438038e-05, "loss": 0.0004, "num_tokens": 150348238.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.265662444368367, "frac_reward_zero_std": 1.0, "grad_norm": 0.6004268456175591, "kl": 0.187255859375, "learning_rate": 1.8384225050845735e-05, "loss": 0.0019, "num_tokens": 150900254.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26605369980926297, "frac_reward_zero_std": 1.0, "grad_norm": 0.021986875552588863, "kl": 0.06341552734375, "learning_rate": 1.837677273872572e-05, "loss": 0.0006, "num_tokens": 151453438.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26644495525015893, "frac_reward_zero_std": 1.0, "grad_norm": 0.12341644151480041, "kl": 0.039276123046875, "learning_rate": 1.8369304797981843e-05, "loss": 0.0004, "num_tokens": 152007150.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2668362106910549, "frac_reward_zero_std": 1.0, "grad_norm": 0.012386732380622562, "kl": 0.058624267578125, "learning_rate": 1.836182124254711e-05, "loss": 0.0006, "num_tokens": 152561086.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2672274661319509, "frac_reward_zero_std": 1.0, "grad_norm": 0.09178048326067578, "kl": 0.0838623046875, "learning_rate": 1.8354322086383664e-05, "loss": 0.0008, "num_tokens": 153114014.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2676187215728469, "frac_reward_zero_std": 1.0, "grad_norm": 343103.261330877, "kl": 5313.609375, "learning_rate": 1.8346807343482744e-05, "loss": 53.0172, "num_tokens": 153667278.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26800997701374285, "frac_reward_zero_std": 1.0, "grad_norm": 552745.7950882558, "kl": 1735.5400390625, "learning_rate": 1.8339277027864683e-05, "loss": 17.3291, "num_tokens": 154222462.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2684012324546388, "frac_reward_zero_std": 1.0, "grad_norm": 5307107.957990737, "kl": 40854.126953125, "learning_rate": 1.833173115357886e-05, "loss": 408.4406, "num_tokens": 154775326.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2687924878955348, "frac_reward_zero_std": 1.0, "grad_norm": 144762.81348018695, "kl": 413.580078125, "learning_rate": 1.832416973470368e-05, "loss": 4.1453, "num_tokens": 155327230.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26918374333643075, "frac_reward_zero_std": 1.0, "grad_norm": 12176210898.529053, "kl": 20034913.8984375, "learning_rate": 1.8316592785346565e-05, "loss": 200361.0312, "num_tokens": 155880702.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26957499877732677, "frac_reward_zero_std": 1.0, "grad_norm": 209649313.39101872, "kl": 885713.81640625, "learning_rate": 1.8309000319643892e-05, "loss": 8860.0273, "num_tokens": 156434158.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.26996625421822273, "frac_reward_zero_std": 1.0, "grad_norm": 19245983398.893326, "kl": 42738964.0, "learning_rate": 1.8301392351760994e-05, "loss": 426780.7812, "num_tokens": 156987742.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2703575096591187, "frac_reward_zero_std": 1.0, "grad_norm": 2905156.826298793, "kl": 2079.7021484375, "learning_rate": 1.8293768895892135e-05, "loss": 20.8339, "num_tokens": 157539934.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27074876510001467, "frac_reward_zero_std": 1.0, "grad_norm": 44017767658.74962, "kl": 193147863.70703125, "learning_rate": 1.828612996626046e-05, "loss": 1935409.5, "num_tokens": 158092590.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27114002054091063, "frac_reward_zero_std": 1.0, "grad_norm": 367164404.26703227, "kl": 12388387.435546875, "learning_rate": 1.8278475577118004e-05, "loss": 124230.875, "num_tokens": 158649566.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2715312759818066, "frac_reward_zero_std": 1.0, "grad_norm": 22165433.65507606, "kl": 79233.3095703125, "learning_rate": 1.827080574274562e-05, "loss": 793.034, "num_tokens": 159205342.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2719225314227026, "frac_reward_zero_std": 1.0, "grad_norm": 23.670345940933387, "kl": 1.7861328125, "learning_rate": 1.8263120477453e-05, "loss": 0.0178, "num_tokens": 159759886.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2723137868635986, "frac_reward_zero_std": 1.0, "grad_norm": 9556.444065295404, "kl": 1240.7392578125, "learning_rate": 1.825541979557861e-05, "loss": 12.4282, "num_tokens": 160315102.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 987.0, "completions/mean_length": 2043.85546875, "completions/mean_terminated_length": 987.0, "completions/min_length": 987.0, "completions/min_terminated_length": 987.0, "epoch": 0.27270504230449455, "frac_reward_zero_std": 1.0, "grad_norm": 1922518898.160928, "kl": 3540473.7529296875, "learning_rate": 1.8247703711489684e-05, "loss": 35343.2695, "num_tokens": 160866185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2730962977453905, "frac_reward_zero_std": 1.0, "grad_norm": 15647.148956732923, "kl": 37.353515625, "learning_rate": 1.8239972239582203e-05, "loss": 0.3728, "num_tokens": 161419705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2734875531862865, "frac_reward_zero_std": 1.0, "grad_norm": 2751.329562558941, "kl": 12.4892578125, "learning_rate": 1.8232225394280836e-05, "loss": 0.1248, "num_tokens": 161973257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27387880862718245, "frac_reward_zero_std": 1.0, "grad_norm": 12.69054049818898, "kl": 2.56640625, "learning_rate": 1.822446319003895e-05, "loss": 0.0257, "num_tokens": 162525497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27427006406807847, "frac_reward_zero_std": 1.0, "grad_norm": 0.882105274701228, "kl": 2.689453125, "learning_rate": 1.8216685641338562e-05, "loss": 0.0269, "num_tokens": 163078729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27466131950897443, "frac_reward_zero_std": 1.0, "grad_norm": 0.5184885303804918, "kl": 2.140625, "learning_rate": 1.820889276269032e-05, "loss": 0.0214, "num_tokens": 163631161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2750525749498704, "frac_reward_zero_std": 1.0, "grad_norm": 0.198686756852265, "kl": 2.40625, "learning_rate": 1.8201084568633462e-05, "loss": 0.0241, "num_tokens": 164185161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27544383039076636, "frac_reward_zero_std": 1.0, "grad_norm": 0.11942513677664823, "kl": 0.59814453125, "learning_rate": 1.819326107373582e-05, "loss": 0.006, "num_tokens": 164739657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27583508583166233, "frac_reward_zero_std": 1.0, "grad_norm": 88.70085140446412, "kl": 1.29833984375, "learning_rate": 1.818542229259376e-05, "loss": 0.0129, "num_tokens": 165294425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2762263412725583, "frac_reward_zero_std": 1.0, "grad_norm": 0.02679835365138112, "kl": 0.1695556640625, "learning_rate": 1.8177568239832167e-05, "loss": 0.0017, "num_tokens": 165848089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2766175967134543, "frac_reward_zero_std": 1.0, "grad_norm": 0.02852058116097262, "kl": 0.1466064453125, "learning_rate": 1.816969893010442e-05, "loss": 0.0015, "num_tokens": 166401849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2770088521543503, "frac_reward_zero_std": 1.0, "grad_norm": 0.032633187008511096, "kl": 0.1429443359375, "learning_rate": 1.816181437809237e-05, "loss": 0.0014, "num_tokens": 166957385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27740010759524625, "frac_reward_zero_std": 1.0, "grad_norm": 0.03618955453341427, "kl": 0.13525390625, "learning_rate": 1.81539145985063e-05, "loss": 0.0014, "num_tokens": 167510665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2777913630361422, "frac_reward_zero_std": 1.0, "grad_norm": 0.039466000946830976, "kl": 0.11517333984375, "learning_rate": 1.814599960608489e-05, "loss": 0.0012, "num_tokens": 168063801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2781826184770382, "frac_reward_zero_std": 1.0, "grad_norm": 0.029764445075966428, "kl": 0.10272216796875, "learning_rate": 1.8138069415595235e-05, "loss": 0.001, "num_tokens": 168616393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27857387391793414, "frac_reward_zero_std": 1.0, "grad_norm": 0.023552575100031746, "kl": 0.0948486328125, "learning_rate": 1.813012404183275e-05, "loss": 0.0009, "num_tokens": 169169449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27896512935883017, "frac_reward_zero_std": 1.0, "grad_norm": 0.020961624445482087, "kl": 0.08538818359375, "learning_rate": 1.8122163499621208e-05, "loss": 0.0009, "num_tokens": 169721625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.27935638479972613, "frac_reward_zero_std": 1.0, "grad_norm": 0.0217621579214259, "kl": 0.07781982421875, "learning_rate": 1.811418780381266e-05, "loss": 0.0008, "num_tokens": 170275481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2797476402406221, "frac_reward_zero_std": 1.0, "grad_norm": 0.06727796787119458, "kl": 0.09722900390625, "learning_rate": 1.8106196969287434e-05, "loss": 0.001, "num_tokens": 170829865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28013889568151806, "frac_reward_zero_std": 1.0, "grad_norm": 0.037946828965989554, "kl": 0.08197021484375, "learning_rate": 1.809819101095411e-05, "loss": 0.0008, "num_tokens": 171383993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28053015112241403, "frac_reward_zero_std": 1.0, "grad_norm": 0.05770005490696138, "kl": 0.07928466796875, "learning_rate": 1.8090169943749477e-05, "loss": 0.0008, "num_tokens": 171940345.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28092140656331, "frac_reward_zero_std": 1.0, "grad_norm": 0.10331397364332107, "kl": 0.08837890625, "learning_rate": 1.808213378263852e-05, "loss": 0.0009, "num_tokens": 172492249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.281312662004206, "frac_reward_zero_std": 1.0, "grad_norm": 0.06201511892437819, "kl": 0.08416748046875, "learning_rate": 1.8074082542614376e-05, "loss": 0.0008, "num_tokens": 173045161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.281703917445102, "frac_reward_zero_std": 1.0, "grad_norm": 0.03523088469400418, "kl": 0.0723876953125, "learning_rate": 1.806601623869832e-05, "loss": 0.0007, "num_tokens": 173598841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28209517288599795, "frac_reward_zero_std": 1.0, "grad_norm": 0.034211190107108325, "kl": 0.06640625, "learning_rate": 1.8057934885939734e-05, "loss": 0.0007, "num_tokens": 174151993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2824864283268939, "frac_reward_zero_std": 1.0, "grad_norm": 0.025324218505719406, "kl": 0.068634033203125, "learning_rate": 1.804983849941607e-05, "loss": 0.0007, "num_tokens": 174707001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2828776837677899, "frac_reward_zero_std": 1.0, "grad_norm": 0.04507058167253771, "kl": 0.068634033203125, "learning_rate": 1.804172709423284e-05, "loss": 0.0007, "num_tokens": 175260937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28326893920868584, "frac_reward_zero_std": 1.0, "grad_norm": 0.019464088630310624, "kl": 0.061279296875, "learning_rate": 1.803360068552356e-05, "loss": 0.0006, "num_tokens": 175814409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28366019464958186, "frac_reward_zero_std": 1.0, "grad_norm": 0.02925416219881808, "kl": 0.058197021484375, "learning_rate": 1.8025459288449755e-05, "loss": 0.0006, "num_tokens": 176367497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28405145009047783, "frac_reward_zero_std": 1.0, "grad_norm": 0.004653175366620131, "kl": 0.04632568359375, "learning_rate": 1.80173029182009e-05, "loss": 0.0005, "num_tokens": 176920345.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2844427055313738, "frac_reward_zero_std": 1.0, "grad_norm": 0.015215559632092512, "kl": 0.049072265625, "learning_rate": 1.8009131589994418e-05, "loss": 0.0005, "num_tokens": 177474265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28483396097226976, "frac_reward_zero_std": 1.0, "grad_norm": 0.006912379763823275, "kl": 0.043670654296875, "learning_rate": 1.800094531907563e-05, "loss": 0.0004, "num_tokens": 178028329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2852252164131657, "frac_reward_zero_std": 1.0, "grad_norm": 0.014155830432535516, "kl": 0.048095703125, "learning_rate": 1.7992744120717735e-05, "loss": 0.0005, "num_tokens": 178581641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2856164718540617, "frac_reward_zero_std": 1.0, "grad_norm": 0.012935454708421796, "kl": 0.05023193359375, "learning_rate": 1.798452801022179e-05, "loss": 0.0005, "num_tokens": 179134217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2860077272949577, "frac_reward_zero_std": 1.0, "grad_norm": 0.028153505454097054, "kl": 0.063873291015625, "learning_rate": 1.7976297002916674e-05, "loss": 0.0006, "num_tokens": 179688185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2863989827358537, "frac_reward_zero_std": 1.0, "grad_norm": 0.04447706271514448, "kl": 0.0721435546875, "learning_rate": 1.7968051114159046e-05, "loss": 0.0007, "num_tokens": 180242057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28679023817674965, "frac_reward_zero_std": 1.0, "grad_norm": 0.016493392341001546, "kl": 0.095458984375, "learning_rate": 1.7959790359333348e-05, "loss": 0.001, "num_tokens": 180795481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2871814936176456, "frac_reward_zero_std": 1.0, "grad_norm": 0.01337942902316732, "kl": 0.126708984375, "learning_rate": 1.7951514753851738e-05, "loss": 0.0013, "num_tokens": 181350281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2875727490585416, "frac_reward_zero_std": 1.0, "grad_norm": 26.0208763025705, "kl": 0.1812744140625, "learning_rate": 1.7943224313154098e-05, "loss": 0.0018, "num_tokens": 181902345.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2879640044994376, "frac_reward_zero_std": 1.0, "grad_norm": 0.04902142401109428, "kl": 0.2252197265625, "learning_rate": 1.793491905270798e-05, "loss": 0.0023, "num_tokens": 182456841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28835525994033356, "frac_reward_zero_std": 1.0, "grad_norm": 0.07232549545066044, "kl": 0.32373046875, "learning_rate": 1.7926598988008584e-05, "loss": 0.0032, "num_tokens": 183009657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28874651538122953, "frac_reward_zero_std": 1.0, "grad_norm": 21103270352.635437, "kl": 3019898886.320801, "learning_rate": 1.791826413457874e-05, "loss": 30146560.0, "num_tokens": 183564825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2891377708221255, "frac_reward_zero_std": 1.0, "grad_norm": 403.2319806158805, "kl": 6.7939453125, "learning_rate": 1.7909914507968855e-05, "loss": 0.0681, "num_tokens": 184117801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.28952902626302146, "frac_reward_zero_std": 1.0, "grad_norm": 11.485625977319822, "kl": 3.36328125, "learning_rate": 1.7901550123756906e-05, "loss": 0.0336, "num_tokens": 184672905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2899202817039174, "frac_reward_zero_std": 1.0, "grad_norm": 544.7056842540377, "kl": 3.625, "learning_rate": 1.789317099754841e-05, "loss": 0.0362, "num_tokens": 185225913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29031153714481345, "frac_reward_zero_std": 1.0, "grad_norm": 16450.260042086295, "kl": 38.0185546875, "learning_rate": 1.7884777144976376e-05, "loss": 0.3801, "num_tokens": 185778841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2907027925857094, "frac_reward_zero_std": 1.0, "grad_norm": 0.1626430468036336, "kl": 1.818359375, "learning_rate": 1.78763685817013e-05, "loss": 0.0182, "num_tokens": 186332537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2910940480266054, "frac_reward_zero_std": 1.0, "grad_norm": 0.06910423552599747, "kl": 1.6904296875, "learning_rate": 1.786794532341111e-05, "loss": 0.0169, "num_tokens": 186887769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29148530346750134, "frac_reward_zero_std": 1.0, "grad_norm": 4684.70625341371, "kl": 18.8505859375, "learning_rate": 1.7859507385821163e-05, "loss": 0.1883, "num_tokens": 187439673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2918765589083973, "frac_reward_zero_std": 1.0, "grad_norm": 1.5919224109217522, "kl": 2.974609375, "learning_rate": 1.78510547846742e-05, "loss": 0.0297, "num_tokens": 187994761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2922678143492933, "frac_reward_zero_std": 1.0, "grad_norm": 64.93791594049992, "kl": 2.607421875, "learning_rate": 1.7842587535740315e-05, "loss": 0.0261, "num_tokens": 188547689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2926590697901893, "frac_reward_zero_std": 1.0, "grad_norm": 0.7573323130784033, "kl": 2.50390625, "learning_rate": 1.7834105654816936e-05, "loss": 0.025, "num_tokens": 189100825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29305032523108526, "frac_reward_zero_std": 1.0, "grad_norm": 0.15757290028795595, "kl": 2.51171875, "learning_rate": 1.7825609157728786e-05, "loss": 0.0251, "num_tokens": 189654921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29344158067198123, "frac_reward_zero_std": 1.0, "grad_norm": 0.7333307366083949, "kl": 2.078125, "learning_rate": 1.7817098060327865e-05, "loss": 0.0208, "num_tokens": 190208777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2938328361128772, "frac_reward_zero_std": 1.0, "grad_norm": 0.18188127480532076, "kl": 2.130859375, "learning_rate": 1.7808572378493404e-05, "loss": 0.0213, "num_tokens": 190761561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29422409155377316, "frac_reward_zero_std": 1.0, "grad_norm": 0.16054904240350815, "kl": 2.0126953125, "learning_rate": 1.7800032128131846e-05, "loss": 0.0201, "num_tokens": 191316105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2946153469946691, "frac_reward_zero_std": 1.0, "grad_norm": 1.789809203496752, "kl": 2.0078125, "learning_rate": 1.7791477325176824e-05, "loss": 0.0201, "num_tokens": 191868409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29500660243556515, "frac_reward_zero_std": 1.0, "grad_norm": 7.590315840162404, "kl": 1.724609375, "learning_rate": 1.7782907985589107e-05, "loss": 0.0173, "num_tokens": 192421529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2953978578764611, "frac_reward_zero_std": 1.0, "grad_norm": 0.10239781845453964, "kl": 1.7705078125, "learning_rate": 1.7774324125356593e-05, "loss": 0.0177, "num_tokens": 192974905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2957891133173571, "frac_reward_zero_std": 1.0, "grad_norm": 0.1008606300319727, "kl": 1.873046875, "learning_rate": 1.776572576049427e-05, "loss": 0.0187, "num_tokens": 193529433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29618036875825304, "frac_reward_zero_std": 1.0, "grad_norm": 0.11563736727612578, "kl": 1.78125, "learning_rate": 1.77571129070442e-05, "loss": 0.0178, "num_tokens": 194082409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.296571624199149, "frac_reward_zero_std": 1.0, "grad_norm": 0.12519633070165798, "kl": 1.3232421875, "learning_rate": 1.774848558107545e-05, "loss": 0.0132, "num_tokens": 194634201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.296962879640045, "frac_reward_zero_std": 1.0, "grad_norm": 0.0481324485693073, "kl": 0.7470703125, "learning_rate": 1.7739843798684114e-05, "loss": 0.0075, "num_tokens": 195186921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.297354135080941, "frac_reward_zero_std": 1.0, "grad_norm": 0.051096020921216276, "kl": 0.3212890625, "learning_rate": 1.773118757599324e-05, "loss": 0.0032, "num_tokens": 195741513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29774539052183696, "frac_reward_zero_std": 1.0, "grad_norm": 0.03889165630348748, "kl": 0.189697265625, "learning_rate": 1.7722516929152828e-05, "loss": 0.0019, "num_tokens": 196295977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2981366459627329, "frac_reward_zero_std": 1.0, "grad_norm": 0.04247095940872503, "kl": 0.1405029296875, "learning_rate": 1.771383187433978e-05, "loss": 0.0014, "num_tokens": 196848793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2985279014036289, "frac_reward_zero_std": 1.0, "grad_norm": 0.04802447807799876, "kl": 0.10418701171875, "learning_rate": 1.7705132427757895e-05, "loss": 0.001, "num_tokens": 197401049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29891915684452486, "frac_reward_zero_std": 1.0, "grad_norm": 0.05715050950875226, "kl": 0.0784912109375, "learning_rate": 1.7696418605637805e-05, "loss": 0.0008, "num_tokens": 197954121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.2993104122854208, "frac_reward_zero_std": 1.0, "grad_norm": 0.057578282858344335, "kl": 0.07684326171875, "learning_rate": 1.7687690424236966e-05, "loss": 0.0008, "num_tokens": 198507017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.29970166772631684, "frac_reward_zero_std": 1.0, "grad_norm": 0.02227440896186004, "kl": 0.05694580078125, "learning_rate": 1.767894789983964e-05, "loss": 0.0006, "num_tokens": 199060217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3000929231672128, "frac_reward_zero_std": 1.0, "grad_norm": 0.009825716352711568, "kl": 0.047027587890625, "learning_rate": 1.7670191048756827e-05, "loss": 0.0005, "num_tokens": 199612809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3004841786081088, "frac_reward_zero_std": 1.0, "grad_norm": 0.006770046916364082, "kl": 0.03875732421875, "learning_rate": 1.7661419887326276e-05, "loss": 0.0004, "num_tokens": 200168585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30087543404900474, "frac_reward_zero_std": 1.0, "grad_norm": 0.003645983120237152, "kl": 0.0279541015625, "learning_rate": 1.7652634431912417e-05, "loss": 0.0003, "num_tokens": 200725929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3012666894899007, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068368217926477425, "kl": 0.030059814453125, "learning_rate": 1.7643834698906358e-05, "loss": 0.0003, "num_tokens": 201280521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3016579449307967, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009451387724655165, "kl": 0.0251312255859375, "learning_rate": 1.763502070472585e-05, "loss": 0.0003, "num_tokens": 201831913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3020492003716927, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007389625912132871, "kl": 0.0223846435546875, "learning_rate": 1.762619246581524e-05, "loss": 0.0002, "num_tokens": 202386873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30244045581258866, "frac_reward_zero_std": 1.0, "grad_norm": 0.0005188965912496684, "kl": 0.0200347900390625, "learning_rate": 1.7617349998645457e-05, "loss": 0.0002, "num_tokens": 202941641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3028317112534846, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007985677115344587, "kl": 0.0195465087890625, "learning_rate": 1.760849331971398e-05, "loss": 0.0002, "num_tokens": 203495769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3032229666943806, "frac_reward_zero_std": 1.0, "grad_norm": 0.00043249647835659624, "kl": 0.018157958984375, "learning_rate": 1.759962244554479e-05, "loss": 0.0002, "num_tokens": 204049881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30361422213527656, "frac_reward_zero_std": 1.0, "grad_norm": 0.00025729751462606683, "kl": 0.018463134765625, "learning_rate": 1.7590737392688363e-05, "loss": 0.0002, "num_tokens": 204603081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3040054775761725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002362411225035243, "kl": 0.0195465087890625, "learning_rate": 1.758183817772163e-05, "loss": 0.0002, "num_tokens": 205156777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30439673301706854, "frac_reward_zero_std": 1.0, "grad_norm": 0.00018566832859902483, "kl": 0.0197601318359375, "learning_rate": 1.757292481724794e-05, "loss": 0.0002, "num_tokens": 205708441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3047879884579645, "frac_reward_zero_std": 1.0, "grad_norm": 0.00013250476415133537, "kl": 0.01837158203125, "learning_rate": 1.7563997327897027e-05, "loss": 0.0002, "num_tokens": 206260841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3051792438988605, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001330378518449015, "kl": 0.0183563232421875, "learning_rate": 1.7555055726325e-05, "loss": 0.0002, "num_tokens": 206815129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30557049933975644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001116862709169225, "kl": 0.0182342529296875, "learning_rate": 1.7546100029214286e-05, "loss": 0.0002, "num_tokens": 207367961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3059617547806524, "frac_reward_zero_std": 1.0, "grad_norm": 0.00048328277612511836, "kl": 0.0178375244140625, "learning_rate": 1.7537130253273613e-05, "loss": 0.0002, "num_tokens": 207922105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30635301022154837, "frac_reward_zero_std": 1.0, "grad_norm": 8.879414674050903e-05, "kl": 0.017120361328125, "learning_rate": 1.7528146415237975e-05, "loss": 0.0002, "num_tokens": 208475961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3067442656624444, "frac_reward_zero_std": 1.0, "grad_norm": 6.502454234229536e-05, "kl": 0.0175628662109375, "learning_rate": 1.751914853186861e-05, "loss": 0.0002, "num_tokens": 209028777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30713552110334036, "frac_reward_zero_std": 1.0, "grad_norm": 0.000102004919226323, "kl": 0.017822265625, "learning_rate": 1.7510136619952947e-05, "loss": 0.0002, "num_tokens": 209583401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3075267765442363, "frac_reward_zero_std": 1.0, "grad_norm": 4.410866498305607e-05, "kl": 0.0189666748046875, "learning_rate": 1.7501110696304598e-05, "loss": 0.0002, "num_tokens": 210137433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3079180319851323, "frac_reward_zero_std": 1.0, "grad_norm": 5.1667236946655505e-05, "kl": 0.016571044921875, "learning_rate": 1.749207077776331e-05, "loss": 0.0002, "num_tokens": 210690697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30830928742602826, "frac_reward_zero_std": 1.0, "grad_norm": 3.6672246351713996e-05, "kl": 0.0177001953125, "learning_rate": 1.748301688119495e-05, "loss": 0.0002, "num_tokens": 211246137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3087005428669242, "frac_reward_zero_std": 1.0, "grad_norm": 6.292647097844618e-05, "kl": 0.0181884765625, "learning_rate": 1.7473949023491455e-05, "loss": 0.0002, "num_tokens": 211800441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.30909179830782024, "frac_reward_zero_std": 1.0, "grad_norm": 7.355674255151504e-05, "kl": 0.01824951171875, "learning_rate": 1.7464867221570815e-05, "loss": 0.0002, "num_tokens": 212353833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3094830537487162, "frac_reward_zero_std": 1.0, "grad_norm": 5.5820227149224815e-05, "kl": 0.01837158203125, "learning_rate": 1.745577149237703e-05, "loss": 0.0002, "num_tokens": 212905337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3098743091896122, "frac_reward_zero_std": 1.0, "grad_norm": 2.0459432040499735e-05, "kl": 0.0179290771484375, "learning_rate": 1.744666185288009e-05, "loss": 0.0002, "num_tokens": 213458905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31026556463050814, "frac_reward_zero_std": 1.0, "grad_norm": 2.8705210452984182e-05, "kl": 0.018280029296875, "learning_rate": 1.743753832007593e-05, "loss": 0.0002, "num_tokens": 214015689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3106568200714041, "frac_reward_zero_std": 1.0, "grad_norm": 4.6248571263856676e-05, "kl": 0.01690673828125, "learning_rate": 1.7428400910986422e-05, "loss": 0.0002, "num_tokens": 214570025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31104807551230007, "frac_reward_zero_std": 1.0, "grad_norm": 5.4041795575263335e-05, "kl": 0.0175323486328125, "learning_rate": 1.7419249642659308e-05, "loss": 0.0002, "num_tokens": 215124105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3114393309531961, "frac_reward_zero_std": 1.0, "grad_norm": 2.5471163180064e-05, "kl": 0.0173492431640625, "learning_rate": 1.7410084532168196e-05, "loss": 0.0002, "num_tokens": 215678169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31183058639409206, "frac_reward_zero_std": 1.0, "grad_norm": 2.8131112002885406e-05, "kl": 0.017486572265625, "learning_rate": 1.740090559661252e-05, "loss": 0.0002, "num_tokens": 216232457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.312221841834988, "frac_reward_zero_std": 1.0, "grad_norm": 4.392673698744952e-05, "kl": 0.0177764892578125, "learning_rate": 1.739171285311751e-05, "loss": 0.0002, "num_tokens": 216786041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.312613097275884, "frac_reward_zero_std": 1.0, "grad_norm": 5.5590946810252875e-05, "kl": 0.019317626953125, "learning_rate": 1.738250631883415e-05, "loss": 0.0002, "num_tokens": 217341753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31300435271677995, "frac_reward_zero_std": 1.0, "grad_norm": 3.5206488641020565e-05, "kl": 0.0179443359375, "learning_rate": 1.737328601093916e-05, "loss": 0.0002, "num_tokens": 217893945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3133956081576759, "frac_reward_zero_std": 1.0, "grad_norm": 4.0876695918997714e-05, "kl": 0.019073486328125, "learning_rate": 1.7364051946634953e-05, "loss": 0.0002, "num_tokens": 218449161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31378686359857194, "frac_reward_zero_std": 1.0, "grad_norm": 7.182267501483559e-05, "kl": 0.01824951171875, "learning_rate": 1.735480414314961e-05, "loss": 0.0002, "num_tokens": 219003529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3141781190394679, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003232504679451081, "kl": 0.0184783935546875, "learning_rate": 1.734554261773685e-05, "loss": 0.0002, "num_tokens": 219557033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3145693744803639, "frac_reward_zero_std": 1.0, "grad_norm": 3.761228054665221e-05, "kl": 0.0186767578125, "learning_rate": 1.7336267387675984e-05, "loss": 0.0002, "num_tokens": 220110665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31496062992125984, "frac_reward_zero_std": 1.0, "grad_norm": 3.93711598416929e-05, "kl": 0.01837158203125, "learning_rate": 1.7326978470271895e-05, "loss": 0.0002, "num_tokens": 220663545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3153518853621558, "frac_reward_zero_std": 1.0, "grad_norm": 4.3458162918050466e-05, "kl": 0.018157958984375, "learning_rate": 1.7317675882855004e-05, "loss": 0.0002, "num_tokens": 221215833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31574314080305177, "frac_reward_zero_std": 1.0, "grad_norm": 4.266445301412161e-05, "kl": 0.0175323486328125, "learning_rate": 1.730835964278124e-05, "loss": 0.0002, "num_tokens": 221771305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3161343962439478, "frac_reward_zero_std": 1.0, "grad_norm": 4.9371037771387227e-05, "kl": 0.0178070068359375, "learning_rate": 1.7299029767432e-05, "loss": 0.0002, "num_tokens": 222326841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31652565168484376, "frac_reward_zero_std": 1.0, "grad_norm": 2.4014634194593197e-05, "kl": 0.01873779296875, "learning_rate": 1.7289686274214116e-05, "loss": 0.0002, "num_tokens": 222879561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3169169071257397, "frac_reward_zero_std": 1.0, "grad_norm": 4.431903478607014e-05, "kl": 0.0182647705078125, "learning_rate": 1.7280329180559835e-05, "loss": 0.0002, "num_tokens": 223432665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3173081625666357, "frac_reward_zero_std": 1.0, "grad_norm": 5.86456148833827e-05, "kl": 0.0168304443359375, "learning_rate": 1.727095850392677e-05, "loss": 0.0002, "num_tokens": 223986985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31769941800753165, "frac_reward_zero_std": 1.0, "grad_norm": 3.569463227655019e-05, "kl": 0.0196685791015625, "learning_rate": 1.7261574261797887e-05, "loss": 0.0002, "num_tokens": 224542505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3180906734484276, "frac_reward_zero_std": 1.0, "grad_norm": 2.5167282315187383e-05, "kl": 0.0177459716796875, "learning_rate": 1.7252176471681452e-05, "loss": 0.0002, "num_tokens": 225095897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31848192888932364, "frac_reward_zero_std": 1.0, "grad_norm": 5.1471938798235306e-05, "kl": 0.0175933837890625, "learning_rate": 1.7242765151111008e-05, "loss": 0.0002, "num_tokens": 225650985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3188731843302196, "frac_reward_zero_std": 1.0, "grad_norm": 0.011373373313181367, "kl": 0.0240631103515625, "learning_rate": 1.7233340317645347e-05, "loss": 0.0002, "num_tokens": 226203785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31926443977111557, "frac_reward_zero_std": 1.0, "grad_norm": 3.540623306557071e-05, "kl": 0.016845703125, "learning_rate": 1.722390198886847e-05, "loss": 0.0002, "num_tokens": 226759305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.31965569521201154, "frac_reward_zero_std": 1.0, "grad_norm": 4.526699810512459e-05, "kl": 0.0169525146484375, "learning_rate": 1.721445018238956e-05, "loss": 0.0002, "num_tokens": 227313513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3200469506529075, "frac_reward_zero_std": 1.0, "grad_norm": 2.7718868605485577e-05, "kl": 0.0195159912109375, "learning_rate": 1.7204984915842934e-05, "loss": 0.0002, "num_tokens": 227868489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32043820609380347, "frac_reward_zero_std": 1.0, "grad_norm": 9.97216275485846e-06, "kl": 0.0182647705078125, "learning_rate": 1.719550620688803e-05, "loss": 0.0002, "num_tokens": 228422009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3208294615346995, "frac_reward_zero_std": 1.0, "grad_norm": 0.006593075337949505, "kl": 0.0206756591796875, "learning_rate": 1.718601407320937e-05, "loss": 0.0002, "num_tokens": 228978137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32122071697559546, "frac_reward_zero_std": 1.0, "grad_norm": 5.065573825814982e-05, "kl": 0.017059326171875, "learning_rate": 1.717650853251651e-05, "loss": 0.0002, "num_tokens": 229531209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3216119724164914, "frac_reward_zero_std": 1.0, "grad_norm": 3.3977754492766424e-05, "kl": 0.0182647705078125, "learning_rate": 1.7166989602544036e-05, "loss": 0.0002, "num_tokens": 230085369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3220032278573874, "frac_reward_zero_std": 1.0, "grad_norm": 3.3306323223493634e-05, "kl": 0.017730712890625, "learning_rate": 1.71574573010515e-05, "loss": 0.0002, "num_tokens": 230640137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32239448329828335, "frac_reward_zero_std": 1.0, "grad_norm": 2.5083147560571567e-05, "kl": 0.0190887451171875, "learning_rate": 1.7147911645823404e-05, "loss": 0.0002, "num_tokens": 231195177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3227857387391793, "frac_reward_zero_std": 1.0, "grad_norm": 3.157878491211884e-05, "kl": 0.01776123046875, "learning_rate": 1.713835265466917e-05, "loss": 0.0002, "num_tokens": 231746921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32317699418007534, "frac_reward_zero_std": 1.0, "grad_norm": 4.7089295651541446e-05, "kl": 0.0189361572265625, "learning_rate": 1.71287803454231e-05, "loss": 0.0002, "num_tokens": 232298985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3235682496209713, "frac_reward_zero_std": 1.0, "grad_norm": 4.0023916780486866e-05, "kl": 0.0185699462890625, "learning_rate": 1.7119194735944336e-05, "loss": 0.0002, "num_tokens": 232852841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32395950506186727, "frac_reward_zero_std": 1.0, "grad_norm": 2.397658486555914e-05, "kl": 0.01910400390625, "learning_rate": 1.710959584411685e-05, "loss": 0.0002, "num_tokens": 233407209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32435076050276324, "frac_reward_zero_std": 1.0, "grad_norm": 0.00010764563499505006, "kl": 0.0164947509765625, "learning_rate": 1.7099983687849374e-05, "loss": 0.0002, "num_tokens": 233962153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3247420159436592, "frac_reward_zero_std": 1.0, "grad_norm": 2.7739657466387403e-05, "kl": 0.018524169921875, "learning_rate": 1.7090358285075403e-05, "loss": 0.0002, "num_tokens": 234517033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32513327138455517, "frac_reward_zero_std": 1.0, "grad_norm": 4.140357829342944e-05, "kl": 0.018310546875, "learning_rate": 1.7080719653753143e-05, "loss": 0.0002, "num_tokens": 235069849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3255245268254512, "frac_reward_zero_std": 1.0, "grad_norm": 5.4536996820856524e-05, "kl": 0.0175628662109375, "learning_rate": 1.7071067811865477e-05, "loss": 0.0002, "num_tokens": 235622585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32591578226634715, "frac_reward_zero_std": 1.0, "grad_norm": 6.474876693961015e-05, "kl": 0.018035888671875, "learning_rate": 1.706140277741994e-05, "loss": 0.0002, "num_tokens": 236177193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3263070377072431, "frac_reward_zero_std": 1.0, "grad_norm": 1.8574695373932683e-05, "kl": 0.017608642578125, "learning_rate": 1.7051724568448677e-05, "loss": 0.0002, "num_tokens": 236731801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3266982931481391, "frac_reward_zero_std": 1.0, "grad_norm": 6.341500001045902e-05, "kl": 0.017486572265625, "learning_rate": 1.7042033203008408e-05, "loss": 0.0002, "num_tokens": 237285369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32708954858903505, "frac_reward_zero_std": 1.0, "grad_norm": 3.213490411923467e-05, "kl": 0.01837158203125, "learning_rate": 1.703232869918041e-05, "loss": 0.0002, "num_tokens": 237839241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.327480804029931, "frac_reward_zero_std": 1.0, "grad_norm": 4.362313811609553e-05, "kl": 0.019256591796875, "learning_rate": 1.7022611075070476e-05, "loss": 0.0002, "num_tokens": 238393833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32787205947082704, "frac_reward_zero_std": 1.0, "grad_norm": 4.016064646206909e-05, "kl": 0.0164794921875, "learning_rate": 1.7012880348808852e-05, "loss": 0.0002, "num_tokens": 238947449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.328263314911723, "frac_reward_zero_std": 1.0, "grad_norm": 5.725493176438385e-05, "kl": 0.0188140869140625, "learning_rate": 1.7003136538550255e-05, "loss": 0.0002, "num_tokens": 239503993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32865457035261897, "frac_reward_zero_std": 1.0, "grad_norm": 4.539709747255102e-05, "kl": 0.01702880859375, "learning_rate": 1.6993379662473804e-05, "loss": 0.0002, "num_tokens": 240058473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32904582579351493, "frac_reward_zero_std": 1.0, "grad_norm": 3.6864474471172426e-05, "kl": 0.0184783935546875, "learning_rate": 1.6983609738782993e-05, "loss": 0.0002, "num_tokens": 240612585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3294370812344109, "frac_reward_zero_std": 1.0, "grad_norm": 2.5541788143820893e-05, "kl": 0.0189056396484375, "learning_rate": 1.697382678570566e-05, "loss": 0.0002, "num_tokens": 241168777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.32982833667530687, "frac_reward_zero_std": 1.0, "grad_norm": 9.125394443181494e-05, "kl": 0.0175628662109375, "learning_rate": 1.6964030821493953e-05, "loss": 0.0002, "num_tokens": 241721881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3302195921162029, "frac_reward_zero_std": 1.0, "grad_norm": 3.869978388165434e-05, "kl": 0.017852783203125, "learning_rate": 1.6954221864424297e-05, "loss": 0.0002, "num_tokens": 242278121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33061084755709885, "frac_reward_zero_std": 1.0, "grad_norm": 3.820229186468694e-05, "kl": 0.018218994140625, "learning_rate": 1.6944399932797355e-05, "loss": 0.0002, "num_tokens": 242830025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3310021029979948, "frac_reward_zero_std": 1.0, "grad_norm": 3.5667816865059206e-05, "kl": 0.018035888671875, "learning_rate": 1.693456504493799e-05, "loss": 0.0002, "num_tokens": 243387273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3313933584388908, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020787481400130704, "kl": 0.0177459716796875, "learning_rate": 1.6924717219195258e-05, "loss": 0.0002, "num_tokens": 243939833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33178461387978675, "frac_reward_zero_std": 1.0, "grad_norm": 7.066109947862126e-05, "kl": 0.017578125, "learning_rate": 1.6914856473942333e-05, "loss": 0.0002, "num_tokens": 244492905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3321758693206827, "frac_reward_zero_std": 1.0, "grad_norm": 4.0281700036707824e-05, "kl": 0.0181427001953125, "learning_rate": 1.6904982827576498e-05, "loss": 0.0002, "num_tokens": 245048057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33256712476157874, "frac_reward_zero_std": 1.0, "grad_norm": 4.356986957245489e-05, "kl": 0.018341064453125, "learning_rate": 1.6895096298519113e-05, "loss": 0.0002, "num_tokens": 245601209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3329583802024747, "frac_reward_zero_std": 1.0, "grad_norm": 3.599028397252498e-05, "kl": 0.0179290771484375, "learning_rate": 1.6885196905215567e-05, "loss": 0.0002, "num_tokens": 246154953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33334963564337067, "frac_reward_zero_std": 1.0, "grad_norm": 6.90664075016459e-05, "kl": 0.0186004638671875, "learning_rate": 1.6875284666135252e-05, "loss": 0.0002, "num_tokens": 246709145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33374089108426663, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001727704656662551, "kl": 0.017730712890625, "learning_rate": 1.686535959977152e-05, "loss": 0.0002, "num_tokens": 247262761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3341321465251626, "frac_reward_zero_std": 1.0, "grad_norm": 3.3453569979254156e-05, "kl": 0.01776123046875, "learning_rate": 1.6855421724641672e-05, "loss": 0.0002, "num_tokens": 247816841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33452340196605856, "frac_reward_zero_std": 1.0, "grad_norm": 2.5155436163844837e-05, "kl": 0.0179443359375, "learning_rate": 1.684547105928689e-05, "loss": 0.0002, "num_tokens": 248371721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3349146574069546, "frac_reward_zero_std": 1.0, "grad_norm": 0.00020227897092945713, "kl": 0.01788330078125, "learning_rate": 1.6835507622272217e-05, "loss": 0.0002, "num_tokens": 248926873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33530591284785055, "frac_reward_zero_std": 1.0, "grad_norm": 4.869550951536859e-05, "kl": 0.0162811279296875, "learning_rate": 1.6825531432186545e-05, "loss": 0.0002, "num_tokens": 249479961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3356971682887465, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001606193072683935, "kl": 0.016143798828125, "learning_rate": 1.6815542507642532e-05, "loss": 0.0002, "num_tokens": 250034169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3360884237296425, "frac_reward_zero_std": 1.0, "grad_norm": 2.843176527415468e-05, "kl": 0.0184326171875, "learning_rate": 1.680554086727662e-05, "loss": 0.0002, "num_tokens": 250588361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33647967917053845, "frac_reward_zero_std": 1.0, "grad_norm": 3.6390467917432505e-05, "kl": 0.01751708984375, "learning_rate": 1.6795526529748954e-05, "loss": 0.0002, "num_tokens": 251143209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3368709346114344, "frac_reward_zero_std": 1.0, "grad_norm": 3.6942762741286546e-05, "kl": 0.017974853515625, "learning_rate": 1.6785499513743383e-05, "loss": 0.0002, "num_tokens": 251695961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33726219005233044, "frac_reward_zero_std": 1.0, "grad_norm": 4.617159669975267e-05, "kl": 0.01715087890625, "learning_rate": 1.677545983796741e-05, "loss": 0.0002, "num_tokens": 252248617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3376534454932264, "frac_reward_zero_std": 1.0, "grad_norm": 0.0002350708529822999, "kl": 0.0179290771484375, "learning_rate": 1.6765407521152145e-05, "loss": 0.0002, "num_tokens": 252802633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33804470093412237, "frac_reward_zero_std": 1.0, "grad_norm": 4.566512368895337e-05, "kl": 0.0178070068359375, "learning_rate": 1.6755342582052293e-05, "loss": 0.0002, "num_tokens": 253357065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33843595637501833, "frac_reward_zero_std": 1.0, "grad_norm": 4.2381905439236994e-05, "kl": 0.0186309814453125, "learning_rate": 1.674526503944611e-05, "loss": 0.0002, "num_tokens": 253912297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3388272118159143, "frac_reward_zero_std": 1.0, "grad_norm": 0.00038708951400586307, "kl": 0.018035888671875, "learning_rate": 1.673517491213535e-05, "loss": 0.0002, "num_tokens": 254466713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.33921846725681026, "frac_reward_zero_std": 1.0, "grad_norm": 4.826910353851571e-05, "kl": 0.017822265625, "learning_rate": 1.6725072218945274e-05, "loss": 0.0002, "num_tokens": 255023625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3396097226977063, "frac_reward_zero_std": 1.0, "grad_norm": 5.4421716368367756e-05, "kl": 0.017578125, "learning_rate": 1.6714956978724563e-05, "loss": 0.0002, "num_tokens": 255578249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34000097813860225, "frac_reward_zero_std": 1.0, "grad_norm": 7.13073151675867e-05, "kl": 0.018463134765625, "learning_rate": 1.6704829210345317e-05, "loss": 0.0002, "num_tokens": 256131145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3403922335794982, "frac_reward_zero_std": 1.0, "grad_norm": 4.087759973388543e-05, "kl": 0.0184478759765625, "learning_rate": 1.6694688932703007e-05, "loss": 0.0002, "num_tokens": 256686713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3407834890203942, "frac_reward_zero_std": 1.0, "grad_norm": 5.591211475669507e-05, "kl": 0.018646240234375, "learning_rate": 1.668453616471645e-05, "loss": 0.0002, "num_tokens": 257238793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34117474446129015, "frac_reward_zero_std": 1.0, "grad_norm": 5.114702412361813e-05, "kl": 0.0168914794921875, "learning_rate": 1.667437092532776e-05, "loss": 0.0002, "num_tokens": 257792185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3415659999021861, "frac_reward_zero_std": 1.0, "grad_norm": 8.552133629028633e-05, "kl": 0.0168609619140625, "learning_rate": 1.6664193233502314e-05, "loss": 0.0002, "num_tokens": 258345465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34195725534308213, "frac_reward_zero_std": 1.0, "grad_norm": 3.020146093127411e-05, "kl": 0.017425537109375, "learning_rate": 1.6654003108228735e-05, "loss": 0.0002, "num_tokens": 258897593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3423485107839781, "frac_reward_zero_std": 1.0, "grad_norm": 3.832141399105482e-05, "kl": 0.017333984375, "learning_rate": 1.6643800568518832e-05, "loss": 0.0002, "num_tokens": 259451321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34273976622487407, "frac_reward_zero_std": 1.0, "grad_norm": 5.34624620376165e-05, "kl": 0.0184326171875, "learning_rate": 1.6633585633407577e-05, "loss": 0.0002, "num_tokens": 260006793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34313102166577003, "frac_reward_zero_std": 1.0, "grad_norm": 4.0122395148234764e-05, "kl": 0.0183563232421875, "learning_rate": 1.662335832195308e-05, "loss": 0.0002, "num_tokens": 260559257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.343522277106666, "frac_reward_zero_std": 1.0, "grad_norm": 3.6445221680966756e-05, "kl": 0.01678466796875, "learning_rate": 1.661311865323652e-05, "loss": 0.0002, "num_tokens": 261115113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34391353254756196, "frac_reward_zero_std": 1.0, "grad_norm": 5.053450787402038e-05, "kl": 0.0182952880859375, "learning_rate": 1.6602866646362152e-05, "loss": 0.0002, "num_tokens": 261670585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.344304787988458, "frac_reward_zero_std": 1.0, "grad_norm": 3.2816895398693327e-05, "kl": 0.0169830322265625, "learning_rate": 1.659260232045724e-05, "loss": 0.0002, "num_tokens": 262225097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34469604342935395, "frac_reward_zero_std": 1.0, "grad_norm": 4.881095248358812e-05, "kl": 0.018157958984375, "learning_rate": 1.6582325694672032e-05, "loss": 0.0002, "num_tokens": 262779593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3450872988702499, "frac_reward_zero_std": 1.0, "grad_norm": 4.4374506324221935e-05, "kl": 0.0188751220703125, "learning_rate": 1.6572036788179728e-05, "loss": 0.0002, "num_tokens": 263332425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3454785543111459, "frac_reward_zero_std": 1.0, "grad_norm": 5.346180774855682e-05, "kl": 0.01910400390625, "learning_rate": 1.6561735620176433e-05, "loss": 0.0002, "num_tokens": 263887081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34586980975204185, "frac_reward_zero_std": 1.0, "grad_norm": 4.33298102256824e-05, "kl": 0.01776123046875, "learning_rate": 1.655142220988114e-05, "loss": 0.0002, "num_tokens": 264439865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34626106519293787, "frac_reward_zero_std": 1.0, "grad_norm": 5.096020998186922e-05, "kl": 0.0186614990234375, "learning_rate": 1.654109657653567e-05, "loss": 0.0002, "num_tokens": 264993065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34665232063383383, "frac_reward_zero_std": 1.0, "grad_norm": 4.2250349086803006e-05, "kl": 0.0186309814453125, "learning_rate": 1.6530758739404658e-05, "loss": 0.0002, "num_tokens": 265546585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3470435760747298, "frac_reward_zero_std": 1.0, "grad_norm": 5.96980923787188e-05, "kl": 0.0165863037109375, "learning_rate": 1.6520408717775507e-05, "loss": 0.0002, "num_tokens": 266098889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34743483151562576, "frac_reward_zero_std": 1.0, "grad_norm": 3.2691714192078764e-05, "kl": 0.01800537109375, "learning_rate": 1.6510046530958343e-05, "loss": 0.0002, "num_tokens": 266651817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34782608695652173, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015763214927596558, "kl": 0.01904296875, "learning_rate": 1.6499672198285996e-05, "loss": 0.0002, "num_tokens": 267206377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3482173423974177, "frac_reward_zero_std": 1.0, "grad_norm": 4.959008333240577e-05, "kl": 0.018035888671875, "learning_rate": 1.6489285739113964e-05, "loss": 0.0002, "num_tokens": 267759897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3486085978383137, "frac_reward_zero_std": 1.0, "grad_norm": 2.777334923403387e-05, "kl": 0.017608642578125, "learning_rate": 1.6478887172820353e-05, "loss": 0.0002, "num_tokens": 268314665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3489998532792097, "frac_reward_zero_std": 1.0, "grad_norm": 0.002130304177635319, "kl": 0.017852783203125, "learning_rate": 1.6468476518805872e-05, "loss": 0.0002, "num_tokens": 268867833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.34939110872010565, "frac_reward_zero_std": 1.0, "grad_norm": 4.348426527437145e-05, "kl": 0.0181427001953125, "learning_rate": 1.645805379649377e-05, "loss": 0.0002, "num_tokens": 269420217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3497823641610016, "frac_reward_zero_std": 1.0, "grad_norm": 4.053641828923069e-05, "kl": 0.0185699462890625, "learning_rate": 1.644761902532983e-05, "loss": 0.0002, "num_tokens": 269973321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3501736196018976, "frac_reward_zero_std": 1.0, "grad_norm": 0.0003510183815779374, "kl": 0.0174560546875, "learning_rate": 1.643717222478229e-05, "loss": 0.0002, "num_tokens": 270528729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35056487504279354, "frac_reward_zero_std": 1.0, "grad_norm": 2.9526398043626788e-05, "kl": 0.0179290771484375, "learning_rate": 1.6426713414341852e-05, "loss": 0.0002, "num_tokens": 271083513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35095613048368957, "frac_reward_zero_std": 1.0, "grad_norm": 3.543760196398638e-05, "kl": 0.0186767578125, "learning_rate": 1.6416242613521612e-05, "loss": 0.0002, "num_tokens": 271636841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35134738592458553, "frac_reward_zero_std": 1.0, "grad_norm": 5.308026049401396e-05, "kl": 0.0169219970703125, "learning_rate": 1.6405759841857047e-05, "loss": 0.0002, "num_tokens": 272190713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3517386413654815, "frac_reward_zero_std": 1.0, "grad_norm": 5.104272981059723e-05, "kl": 0.0176239013671875, "learning_rate": 1.639526511890596e-05, "loss": 0.0002, "num_tokens": 272744313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35212989680637746, "frac_reward_zero_std": 1.0, "grad_norm": 3.4424510272926175e-05, "kl": 0.019378662109375, "learning_rate": 1.6384758464248456e-05, "loss": 0.0002, "num_tokens": 273297769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35252115224727343, "frac_reward_zero_std": 1.0, "grad_norm": 5.6597477111799444e-05, "kl": 0.0170745849609375, "learning_rate": 1.63742398974869e-05, "loss": 0.0002, "num_tokens": 273851289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3529124076881694, "frac_reward_zero_std": 1.0, "grad_norm": 4.117689868206798e-05, "kl": 0.018096923828125, "learning_rate": 1.6363709438245877e-05, "loss": 0.0002, "num_tokens": 274404953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3533036631290654, "frac_reward_zero_std": 1.0, "grad_norm": 4.5528609778868135e-05, "kl": 0.01849365234375, "learning_rate": 1.635316710617217e-05, "loss": 0.0002, "num_tokens": 274961993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3536949185699614, "frac_reward_zero_std": 1.0, "grad_norm": 4.892978863209686e-05, "kl": 0.0181121826171875, "learning_rate": 1.6342612920934703e-05, "loss": 0.0002, "num_tokens": 275517001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35408617401085735, "frac_reward_zero_std": 1.0, "grad_norm": 3.1941832073198447e-05, "kl": 0.01800537109375, "learning_rate": 1.6332046902224518e-05, "loss": 0.0002, "num_tokens": 276069305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3544774294517533, "frac_reward_zero_std": 1.0, "grad_norm": 1.677334422954987e-05, "kl": 0.0176849365234375, "learning_rate": 1.632146906975474e-05, "loss": 0.0002, "num_tokens": 276623673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3548686848926493, "frac_reward_zero_std": 1.0, "grad_norm": 5.401354725647064e-05, "kl": 0.017364501953125, "learning_rate": 1.631087944326053e-05, "loss": 0.0002, "num_tokens": 277178601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35525994033354524, "frac_reward_zero_std": 1.0, "grad_norm": 3.663154491099716e-05, "kl": 0.017913818359375, "learning_rate": 1.630027804249905e-05, "loss": 0.0002, "num_tokens": 277732329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35565119577444126, "frac_reward_zero_std": 1.0, "grad_norm": 6.519119052635021e-05, "kl": 0.0165863037109375, "learning_rate": 1.628966488724944e-05, "loss": 0.0002, "num_tokens": 278286265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35604245121533723, "frac_reward_zero_std": 1.0, "grad_norm": 2.8193158787065635e-05, "kl": 0.0182037353515625, "learning_rate": 1.6279039997312755e-05, "loss": 0.0002, "num_tokens": 278840297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3564337066562332, "frac_reward_zero_std": 1.0, "grad_norm": 8.595171182451131e-05, "kl": 0.01788330078125, "learning_rate": 1.626840339251196e-05, "loss": 0.0002, "num_tokens": 279396857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35682496209712916, "frac_reward_zero_std": 1.0, "grad_norm": 3.898986414792195e-05, "kl": 0.0172576904296875, "learning_rate": 1.6257755092691865e-05, "loss": 0.0002, "num_tokens": 279951273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3572162175380251, "frac_reward_zero_std": 1.0, "grad_norm": 4.2627550033729355e-05, "kl": 0.018707275390625, "learning_rate": 1.6247095117719106e-05, "loss": 0.0002, "num_tokens": 280506985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3576074729789211, "frac_reward_zero_std": 1.0, "grad_norm": 4.9337216329529484e-05, "kl": 0.019195556640625, "learning_rate": 1.6236423487482097e-05, "loss": 0.0002, "num_tokens": 281059705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3579987284198171, "frac_reward_zero_std": 1.0, "grad_norm": 5.020964318546325e-05, "kl": 0.01690673828125, "learning_rate": 1.6225740221890996e-05, "loss": 0.0002, "num_tokens": 281614217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3583899838607131, "frac_reward_zero_std": 1.0, "grad_norm": 2.7632972773277246e-05, "kl": 0.0174713134765625, "learning_rate": 1.621504534087768e-05, "loss": 0.0002, "num_tokens": 282168409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35878123930160905, "frac_reward_zero_std": 1.0, "grad_norm": 2.1333047267947684e-05, "kl": 0.0179290771484375, "learning_rate": 1.6204338864395683e-05, "loss": 0.0002, "num_tokens": 282721545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.359172494742505, "frac_reward_zero_std": 1.0, "grad_norm": 4.216602093374114e-05, "kl": 0.0190887451171875, "learning_rate": 1.6193620812420177e-05, "loss": 0.0002, "num_tokens": 283275369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.359563750183401, "frac_reward_zero_std": 1.0, "grad_norm": 7.76577232223183e-05, "kl": 0.018341064453125, "learning_rate": 1.6182891204947942e-05, "loss": 0.0002, "num_tokens": 283827961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.35995500562429694, "frac_reward_zero_std": 1.0, "grad_norm": 4.116486115869022e-05, "kl": 0.01904296875, "learning_rate": 1.61721500619973e-05, "loss": 0.0002, "num_tokens": 284381049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36034626106519296, "frac_reward_zero_std": 1.0, "grad_norm": 5.043688443140699e-05, "kl": 0.0194244384765625, "learning_rate": 1.616139740360811e-05, "loss": 0.0002, "num_tokens": 284936185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36073751650608893, "frac_reward_zero_std": 1.0, "grad_norm": 4.651004835965276e-05, "kl": 0.0174713134765625, "learning_rate": 1.6150633249841696e-05, "loss": 0.0002, "num_tokens": 285491033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3611287719469849, "frac_reward_zero_std": 1.0, "grad_norm": 4.995701491140027e-05, "kl": 0.018096923828125, "learning_rate": 1.6139857620780852e-05, "loss": 0.0002, "num_tokens": 286047545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36152002738788086, "frac_reward_zero_std": 1.0, "grad_norm": 5.050607972413286e-05, "kl": 0.01849365234375, "learning_rate": 1.6129070536529767e-05, "loss": 0.0002, "num_tokens": 286601609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3619112828287768, "frac_reward_zero_std": 1.0, "grad_norm": 3.307054352706149e-05, "kl": 0.017303466796875, "learning_rate": 1.6118272017214005e-05, "loss": 0.0002, "num_tokens": 287156009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3623025382696728, "frac_reward_zero_std": 1.0, "grad_norm": 4.3351861894573594e-05, "kl": 0.0174407958984375, "learning_rate": 1.6107462082980468e-05, "loss": 0.0002, "num_tokens": 287710601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3626937937105688, "frac_reward_zero_std": 1.0, "grad_norm": 6.557250658772364e-05, "kl": 0.0177154541015625, "learning_rate": 1.609664075399735e-05, "loss": 0.0002, "num_tokens": 288264889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3630850491514648, "frac_reward_zero_std": 1.0, "grad_norm": 8.209541618771883e-05, "kl": 0.0177154541015625, "learning_rate": 1.60858080504541e-05, "loss": 0.0002, "num_tokens": 288822281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36347630459236074, "frac_reward_zero_std": 1.0, "grad_norm": 3.14942377527897e-05, "kl": 0.0183563232421875, "learning_rate": 1.607496399256141e-05, "loss": 0.0002, "num_tokens": 289377097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3638675600332567, "frac_reward_zero_std": 1.0, "grad_norm": 3.422357547909548e-05, "kl": 0.01715087890625, "learning_rate": 1.606410860055113e-05, "loss": 0.0002, "num_tokens": 289929049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3642588154741527, "frac_reward_zero_std": 1.0, "grad_norm": 3.8683175605576916e-05, "kl": 0.0181121826171875, "learning_rate": 1.605324189467627e-05, "loss": 0.0002, "num_tokens": 290483049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36465007091504864, "frac_reward_zero_std": 1.0, "grad_norm": 5.236039683873024e-05, "kl": 0.0170440673828125, "learning_rate": 1.6042363895210948e-05, "loss": 0.0002, "num_tokens": 291035097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36504132635594466, "frac_reward_zero_std": 1.0, "grad_norm": 4.457554875924166e-05, "kl": 0.0181427001953125, "learning_rate": 1.6031474622450346e-05, "loss": 0.0002, "num_tokens": 291587033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3654325817968406, "frac_reward_zero_std": 1.0, "grad_norm": 4.515224918497966e-05, "kl": 0.018463134765625, "learning_rate": 1.6020574096710684e-05, "loss": 0.0002, "num_tokens": 292142521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3658238372377366, "frac_reward_zero_std": 1.0, "grad_norm": 6.178232865500509e-05, "kl": 0.0186004638671875, "learning_rate": 1.6009662338329177e-05, "loss": 0.0002, "num_tokens": 292694489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36621509267863256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009161539378941522, "kl": 0.0174560546875, "learning_rate": 1.5998739367663995e-05, "loss": 0.0002, "num_tokens": 293248073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3666063481195285, "frac_reward_zero_std": 1.0, "grad_norm": 4.608213170777319e-05, "kl": 0.0185699462890625, "learning_rate": 1.5987805205094225e-05, "loss": 0.0002, "num_tokens": 293799945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3669976035604245, "frac_reward_zero_std": 1.0, "grad_norm": 5.660559466840388e-05, "kl": 0.0186767578125, "learning_rate": 1.5976859871019847e-05, "loss": 0.0002, "num_tokens": 294354969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3673888590013205, "frac_reward_zero_std": 1.0, "grad_norm": 2.677575858776083e-05, "kl": 0.017120361328125, "learning_rate": 1.596590338586166e-05, "loss": 0.0002, "num_tokens": 294908809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3677801144422165, "frac_reward_zero_std": 1.0, "grad_norm": 4.3144849148542e-05, "kl": 0.017120361328125, "learning_rate": 1.5954935770061286e-05, "loss": 0.0002, "num_tokens": 295464761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36817136988311244, "frac_reward_zero_std": 1.0, "grad_norm": 5.610147846849814e-05, "kl": 0.0175018310546875, "learning_rate": 1.5943957044081117e-05, "loss": 0.0002, "num_tokens": 296019481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3685626253240084, "frac_reward_zero_std": 1.0, "grad_norm": 5.2940156096837656e-05, "kl": 0.0187530517578125, "learning_rate": 1.5932967228404255e-05, "loss": 0.0002, "num_tokens": 296572985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3689538807649044, "frac_reward_zero_std": 1.0, "grad_norm": 2.811144040252028e-05, "kl": 0.01751708984375, "learning_rate": 1.5921966343534508e-05, "loss": 0.0002, "num_tokens": 297127113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36934513620580034, "frac_reward_zero_std": 1.0, "grad_norm": 4.567988017686094e-05, "kl": 0.0190277099609375, "learning_rate": 1.5910954409996332e-05, "loss": 0.0002, "num_tokens": 297679225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.36973639164669636, "frac_reward_zero_std": 1.0, "grad_norm": 1.395109376768414e-05, "kl": 0.017425537109375, "learning_rate": 1.5899931448334788e-05, "loss": 0.0002, "num_tokens": 298233529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3701276470875923, "frac_reward_zero_std": 1.0, "grad_norm": 0.00011309084288067398, "kl": 0.0182952880859375, "learning_rate": 1.588889747911553e-05, "loss": 0.0002, "num_tokens": 298786649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3705189025284883, "frac_reward_zero_std": 1.0, "grad_norm": 4.5492847315410574e-05, "kl": 0.01971435546875, "learning_rate": 1.5877852522924733e-05, "loss": 0.0002, "num_tokens": 299340489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37091015796938426, "frac_reward_zero_std": 1.0, "grad_norm": 4.078617384909745e-05, "kl": 0.017333984375, "learning_rate": 1.5866796600369073e-05, "loss": 0.0002, "num_tokens": 299894825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3713014134102802, "frac_reward_zero_std": 1.0, "grad_norm": 3.654289368113481e-05, "kl": 0.01751708984375, "learning_rate": 1.5855729732075696e-05, "loss": 0.0002, "num_tokens": 300447881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3716926688511762, "frac_reward_zero_std": 1.0, "grad_norm": 4.286569145443988e-05, "kl": 0.0175628662109375, "learning_rate": 1.5844651938692158e-05, "loss": 0.0002, "num_tokens": 301001241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3720839242920722, "frac_reward_zero_std": 1.0, "grad_norm": 4.060248487488138e-05, "kl": 0.0186614990234375, "learning_rate": 1.5833563240886398e-05, "loss": 0.0002, "num_tokens": 301554457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3724751797329682, "frac_reward_zero_std": 1.0, "grad_norm": 4.99682876720341e-05, "kl": 0.018402099609375, "learning_rate": 1.582246365934671e-05, "loss": 0.0002, "num_tokens": 302110281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37286643517386414, "frac_reward_zero_std": 1.0, "grad_norm": 7.24130831211361e-05, "kl": 0.017791748046875, "learning_rate": 1.581135321478169e-05, "loss": 0.0002, "num_tokens": 302663385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3732576906147601, "frac_reward_zero_std": 1.0, "grad_norm": 3.0393088190893223e-05, "kl": 0.01654052734375, "learning_rate": 1.5800231927920196e-05, "loss": 0.0002, "num_tokens": 303220537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3736489460556561, "frac_reward_zero_std": 1.0, "grad_norm": 5.602958533046215e-05, "kl": 0.0178985595703125, "learning_rate": 1.5789099819511317e-05, "loss": 0.0002, "num_tokens": 303773001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37404020149655204, "frac_reward_zero_std": 1.0, "grad_norm": 2.2057575614966176e-05, "kl": 0.0167999267578125, "learning_rate": 1.5777956910324333e-05, "loss": 0.0002, "num_tokens": 304326521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37443145693744806, "frac_reward_zero_std": 1.0, "grad_norm": 2.8176684225418554e-05, "kl": 0.0174713134765625, "learning_rate": 1.5766803221148676e-05, "loss": 0.0002, "num_tokens": 304880409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.374822712378344, "frac_reward_zero_std": 1.0, "grad_norm": 3.606521970289035e-05, "kl": 0.018707275390625, "learning_rate": 1.5755638772793883e-05, "loss": 0.0002, "num_tokens": 305434649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37521396781924, "frac_reward_zero_std": 1.0, "grad_norm": 7.787789242469531e-05, "kl": 0.01824951171875, "learning_rate": 1.574446358608958e-05, "loss": 0.0002, "num_tokens": 305989033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37560522326013596, "frac_reward_zero_std": 1.0, "grad_norm": 3.9980698526958986e-05, "kl": 0.0181884765625, "learning_rate": 1.5733277681885413e-05, "loss": 0.0002, "num_tokens": 306544473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3759964787010319, "frac_reward_zero_std": 1.0, "grad_norm": 5.607981490787606e-05, "kl": 0.0192413330078125, "learning_rate": 1.5722081081051032e-05, "loss": 0.0002, "num_tokens": 307097689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3763877341419279, "frac_reward_zero_std": 1.0, "grad_norm": 3.5902212854173695e-05, "kl": 0.017974853515625, "learning_rate": 1.5710873804476035e-05, "loss": 0.0002, "num_tokens": 307653145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3767789895828239, "frac_reward_zero_std": 1.0, "grad_norm": 3.4284056348916966e-05, "kl": 0.0185546875, "learning_rate": 1.569965587306995e-05, "loss": 0.0002, "num_tokens": 308207113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3771702450237199, "frac_reward_zero_std": 1.0, "grad_norm": 0.00014474679788230437, "kl": 0.018585205078125, "learning_rate": 1.5688427307762172e-05, "loss": 0.0002, "num_tokens": 308758537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37756150046461584, "frac_reward_zero_std": 1.0, "grad_norm": 5.294072907370373e-05, "kl": 0.0192108154296875, "learning_rate": 1.5677188129501942e-05, "loss": 0.0002, "num_tokens": 309312745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3779527559055118, "frac_reward_zero_std": 1.0, "grad_norm": 4.624360220736297e-05, "kl": 0.018310546875, "learning_rate": 1.5665938359258307e-05, "loss": 0.0002, "num_tokens": 309865817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37834401134640777, "frac_reward_zero_std": 1.0, "grad_norm": 3.055459056731959e-05, "kl": 0.016387939453125, "learning_rate": 1.565467801802006e-05, "loss": 0.0002, "num_tokens": 310417929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37873526678730374, "frac_reward_zero_std": 1.0, "grad_norm": 3.111283234386637e-05, "kl": 0.0174560546875, "learning_rate": 1.5643407126795732e-05, "loss": 0.0002, "num_tokens": 310971833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.37912652222819976, "frac_reward_zero_std": 1.0, "grad_norm": 5.135275616568401e-05, "kl": 0.017608642578125, "learning_rate": 1.5632125706613534e-05, "loss": 0.0002, "num_tokens": 311525417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3795177776690957, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016084959796020923, "kl": 0.0199432373046875, "learning_rate": 1.5620833778521306e-05, "loss": 0.0002, "num_tokens": 312077561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3799090331099917, "frac_reward_zero_std": 1.0, "grad_norm": 4.6122225396479906e-05, "kl": 0.0188140869140625, "learning_rate": 1.5609531363586516e-05, "loss": 0.0002, "num_tokens": 312630841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38030028855088766, "frac_reward_zero_std": 1.0, "grad_norm": 4.3837652535421416e-05, "kl": 0.0167999267578125, "learning_rate": 1.5598218482896182e-05, "loss": 0.0002, "num_tokens": 313184745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3806915439917836, "frac_reward_zero_std": 1.0, "grad_norm": 4.9453597025246734e-05, "kl": 0.017730712890625, "learning_rate": 1.5586895157556854e-05, "loss": 0.0002, "num_tokens": 313739849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3810827994326796, "frac_reward_zero_std": 1.0, "grad_norm": 5.5413081592472504e-05, "kl": 0.017791748046875, "learning_rate": 1.5575561408694563e-05, "loss": 0.0002, "num_tokens": 314296921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3814740548735756, "frac_reward_zero_std": 1.0, "grad_norm": 6.428921405379149e-05, "kl": 0.0178985595703125, "learning_rate": 1.5564217257454796e-05, "loss": 0.0002, "num_tokens": 314850745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3818653103144716, "frac_reward_zero_std": 1.0, "grad_norm": 4.078715822946164e-05, "kl": 0.0196075439453125, "learning_rate": 1.555286272500244e-05, "loss": 0.0002, "num_tokens": 315403833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38225656575536754, "frac_reward_zero_std": 1.0, "grad_norm": 2.323986777674999e-05, "kl": 0.016876220703125, "learning_rate": 1.554149783252175e-05, "loss": 0.0002, "num_tokens": 315960153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3826478211962635, "frac_reward_zero_std": 1.0, "grad_norm": 3.5393836692705365e-05, "kl": 0.0178680419921875, "learning_rate": 1.5530122601216318e-05, "loss": 0.0002, "num_tokens": 316513929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38303907663715947, "frac_reward_zero_std": 1.0, "grad_norm": 4.6024339426862246e-05, "kl": 0.0199737548828125, "learning_rate": 1.551873705230902e-05, "loss": 0.0002, "num_tokens": 317069033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38343033207805544, "frac_reward_zero_std": 1.0, "grad_norm": 5.2275304446829655e-05, "kl": 0.01641845703125, "learning_rate": 1.5507341207041983e-05, "loss": 0.0002, "num_tokens": 317623097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38382158751895146, "frac_reward_zero_std": 1.0, "grad_norm": 4.044329946614941e-05, "kl": 0.0167388916015625, "learning_rate": 1.5495935086676533e-05, "loss": 0.0002, "num_tokens": 318176793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3842128429598474, "frac_reward_zero_std": 1.0, "grad_norm": 3.248824732413013e-05, "kl": 0.01727294921875, "learning_rate": 1.5484518712493188e-05, "loss": 0.0002, "num_tokens": 318730105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3846040984007434, "frac_reward_zero_std": 1.0, "grad_norm": 6.223526719650996e-05, "kl": 0.018035888671875, "learning_rate": 1.5473092105791583e-05, "loss": 0.0002, "num_tokens": 319283897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38499535384163935, "frac_reward_zero_std": 1.0, "grad_norm": 4.898812689369625e-05, "kl": 0.018463134765625, "learning_rate": 1.546165528789044e-05, "loss": 0.0002, "num_tokens": 319839417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3853866092825353, "frac_reward_zero_std": 1.0, "grad_norm": 5.945985244953227e-05, "kl": 0.0185089111328125, "learning_rate": 1.5450208280127543e-05, "loss": 0.0002, "num_tokens": 320392809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3857778647234313, "frac_reward_zero_std": 1.0, "grad_norm": 4.619052853880653e-05, "kl": 0.017608642578125, "learning_rate": 1.5438751103859685e-05, "loss": 0.0002, "num_tokens": 320948697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3861691201643273, "frac_reward_zero_std": 1.0, "grad_norm": 3.079636729585329e-05, "kl": 0.019744873046875, "learning_rate": 1.542728378046262e-05, "loss": 0.0002, "num_tokens": 321501049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38656037560522327, "frac_reward_zero_std": 1.0, "grad_norm": 5.52555310291372e-05, "kl": 0.017578125, "learning_rate": 1.541580633133104e-05, "loss": 0.0002, "num_tokens": 322055689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38695163104611924, "frac_reward_zero_std": 1.0, "grad_norm": 3.3524261146256346e-05, "kl": 0.0186004638671875, "learning_rate": 1.5404318777878545e-05, "loss": 0.0002, "num_tokens": 322609161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3873428864870152, "frac_reward_zero_std": 1.0, "grad_norm": 4.590372254062387e-05, "kl": 0.0186767578125, "learning_rate": 1.5392821141537556e-05, "loss": 0.0002, "num_tokens": 323163017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38773414192791117, "frac_reward_zero_std": 1.0, "grad_norm": 2.659799405813481e-05, "kl": 0.0172119140625, "learning_rate": 1.5381313443759328e-05, "loss": 0.0002, "num_tokens": 323719369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38812539736880713, "frac_reward_zero_std": 1.0, "grad_norm": 5.878490730689021e-05, "kl": 0.01727294921875, "learning_rate": 1.536979570601388e-05, "loss": 0.0002, "num_tokens": 324272041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38851665280970316, "frac_reward_zero_std": 1.0, "grad_norm": 3.847046552475951e-05, "kl": 0.01776123046875, "learning_rate": 1.5358267949789968e-05, "loss": 0.0002, "num_tokens": 324825881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3889079082505991, "frac_reward_zero_std": 1.0, "grad_norm": 0.00015128754151049465, "kl": 0.017181396484375, "learning_rate": 1.534673019659503e-05, "loss": 0.0002, "num_tokens": 325380009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3892991636914951, "frac_reward_zero_std": 1.0, "grad_norm": 4.245783663192791e-05, "kl": 0.017822265625, "learning_rate": 1.5335182467955162e-05, "loss": 0.0002, "num_tokens": 325933513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.38969041913239105, "frac_reward_zero_std": 1.0, "grad_norm": 3.1194922868636755e-05, "kl": 0.0179443359375, "learning_rate": 1.532362478541507e-05, "loss": 0.0002, "num_tokens": 326485817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.390081674573287, "frac_reward_zero_std": 1.0, "grad_norm": 9.005290450112272e-05, "kl": 0.0177459716796875, "learning_rate": 1.5312057170538033e-05, "loss": 0.0002, "num_tokens": 327039177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.390472930014183, "frac_reward_zero_std": 1.0, "grad_norm": 3.4580578842132725e-05, "kl": 0.017974853515625, "learning_rate": 1.5300479644905864e-05, "loss": 0.0002, "num_tokens": 327591657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.390864185455079, "frac_reward_zero_std": 1.0, "grad_norm": 0.004211106395797358, "kl": 0.01873779296875, "learning_rate": 1.5288892230118848e-05, "loss": 0.0002, "num_tokens": 328146233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39125544089597497, "frac_reward_zero_std": 1.0, "grad_norm": 4.167194668033827e-05, "kl": 0.0182342529296875, "learning_rate": 1.5277294947795747e-05, "loss": 0.0002, "num_tokens": 328698937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39164669633687094, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009031258025553869, "kl": 0.018585205078125, "learning_rate": 1.526568781957371e-05, "loss": 0.0002, "num_tokens": 329254713.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3920379517777669, "frac_reward_zero_std": 1.0, "grad_norm": 5.0834355973895646e-05, "kl": 0.01678466796875, "learning_rate": 1.5254070867108277e-05, "loss": 0.0002, "num_tokens": 329810665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39242920721866287, "frac_reward_zero_std": 1.0, "grad_norm": 6.327718576029255e-05, "kl": 0.0179290771484375, "learning_rate": 1.5242444112073296e-05, "loss": 0.0002, "num_tokens": 330365385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39282046265955883, "frac_reward_zero_std": 1.0, "grad_norm": 2.388588825826354e-05, "kl": 0.018798828125, "learning_rate": 1.5230807576160915e-05, "loss": 0.0002, "num_tokens": 330919225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39321171810045485, "frac_reward_zero_std": 1.0, "grad_norm": 3.6526517388255295e-05, "kl": 0.018585205078125, "learning_rate": 1.5219161281081536e-05, "loss": 0.0002, "num_tokens": 331472985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3936029735413508, "frac_reward_zero_std": 1.0, "grad_norm": 3.264600813991382e-05, "kl": 0.01849365234375, "learning_rate": 1.5207505248563755e-05, "loss": 0.0002, "num_tokens": 332025753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3939942289822468, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014826117442376368, "kl": 0.0201416015625, "learning_rate": 1.5195839500354337e-05, "loss": 0.0002, "num_tokens": 332579769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39438548442314275, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001543674895083523, "kl": 0.01837158203125, "learning_rate": 1.5184164058218186e-05, "loss": 0.0002, "num_tokens": 333133193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3947767398640387, "frac_reward_zero_std": 1.0, "grad_norm": 2.2038591564254455e-05, "kl": 0.0182647705078125, "learning_rate": 1.5172478943938288e-05, "loss": 0.0002, "num_tokens": 333688937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3951679953049347, "frac_reward_zero_std": 1.0, "grad_norm": 2.3237820891412216e-05, "kl": 0.0183258056640625, "learning_rate": 1.5160784179315662e-05, "loss": 0.0002, "num_tokens": 334241513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3955592507458307, "frac_reward_zero_std": 1.0, "grad_norm": 5.110791433427769e-05, "kl": 0.0182342529296875, "learning_rate": 1.5149079786169344e-05, "loss": 0.0002, "num_tokens": 334794617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39595050618672667, "frac_reward_zero_std": 1.0, "grad_norm": 9.71891933018215e-05, "kl": 0.01806640625, "learning_rate": 1.5137365786336329e-05, "loss": 0.0002, "num_tokens": 335348233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39634176162762264, "frac_reward_zero_std": 1.0, "grad_norm": 5.217753753029074e-05, "kl": 0.017974853515625, "learning_rate": 1.5125642201671543e-05, "loss": 0.0002, "num_tokens": 335901545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3967330170685186, "frac_reward_zero_std": 1.0, "grad_norm": 4.8494223577944766e-05, "kl": 0.0182952880859375, "learning_rate": 1.5113909054047779e-05, "loss": 0.0002, "num_tokens": 336456937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39712427250941457, "frac_reward_zero_std": 1.0, "grad_norm": 4.547640546596552e-05, "kl": 0.01837158203125, "learning_rate": 1.5102166365355688e-05, "loss": 0.0002, "num_tokens": 337012489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39751552795031053, "frac_reward_zero_std": 1.0, "grad_norm": 6.932937251716067e-05, "kl": 0.01763916015625, "learning_rate": 1.5090414157503715e-05, "loss": 0.0002, "num_tokens": 337567577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39790678339120655, "frac_reward_zero_std": 1.0, "grad_norm": 8.261921288236698e-05, "kl": 0.0178070068359375, "learning_rate": 1.5078652452418063e-05, "loss": 0.0002, "num_tokens": 338121497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3982980388321025, "frac_reward_zero_std": 1.0, "grad_norm": 6.0682566210935023e-05, "kl": 0.0181121826171875, "learning_rate": 1.5066881272042655e-05, "loss": 0.0002, "num_tokens": 338675289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3986892942729985, "frac_reward_zero_std": 1.0, "grad_norm": 2.777898121206217e-05, "kl": 0.0179595947265625, "learning_rate": 1.5055100638339094e-05, "loss": 0.0002, "num_tokens": 339229561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.39908054971389445, "frac_reward_zero_std": 1.0, "grad_norm": 4.336359614102243e-05, "kl": 0.0172576904296875, "learning_rate": 1.504331057328662e-05, "loss": 0.0002, "num_tokens": 339784041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3994718051547904, "frac_reward_zero_std": 1.0, "grad_norm": 3.661268469584936e-05, "kl": 0.0187225341796875, "learning_rate": 1.503151109888207e-05, "loss": 0.0002, "num_tokens": 340338217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.3998630605956864, "frac_reward_zero_std": 1.0, "grad_norm": 5.4777562260823255e-05, "kl": 0.018096923828125, "learning_rate": 1.501970223713983e-05, "loss": 0.0002, "num_tokens": 340893257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4002543160365824, "frac_reward_zero_std": 1.0, "grad_norm": 7.266845444288953e-05, "kl": 0.0164642333984375, "learning_rate": 1.5007884010091808e-05, "loss": 0.0002, "num_tokens": 341449097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40064557147747837, "frac_reward_zero_std": 1.0, "grad_norm": 2.2061702921351697e-05, "kl": 0.0177001953125, "learning_rate": 1.4996056439787384e-05, "loss": 0.0002, "num_tokens": 342001449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40103682691837433, "frac_reward_zero_std": 1.0, "grad_norm": 2.9853209512747988e-05, "kl": 0.01971435546875, "learning_rate": 1.4984219548293361e-05, "loss": 0.0002, "num_tokens": 342554585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4014280823592703, "frac_reward_zero_std": 1.0, "grad_norm": 9.840665378651952e-05, "kl": 0.0169677734375, "learning_rate": 1.4972373357693945e-05, "loss": 0.0002, "num_tokens": 343109001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40181933780016627, "frac_reward_zero_std": 1.0, "grad_norm": 5.0513789181219305e-05, "kl": 0.018218994140625, "learning_rate": 1.4960517890090683e-05, "loss": 0.0002, "num_tokens": 343663577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40221059324106223, "frac_reward_zero_std": 1.0, "grad_norm": 2.2543849400416486e-05, "kl": 0.0186614990234375, "learning_rate": 1.494865316760243e-05, "loss": 0.0002, "num_tokens": 344215273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40260184868195825, "frac_reward_zero_std": 1.0, "grad_norm": 6.0424338885178386e-05, "kl": 0.0172576904296875, "learning_rate": 1.4936779212365317e-05, "loss": 0.0002, "num_tokens": 344768185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4029931041228542, "frac_reward_zero_std": 1.0, "grad_norm": 5.0840533404418224e-05, "kl": 0.01739501953125, "learning_rate": 1.492489604653269e-05, "loss": 0.0002, "num_tokens": 345324249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4033843595637502, "frac_reward_zero_std": 1.0, "grad_norm": 2.981241288546441e-05, "kl": 0.01800537109375, "learning_rate": 1.4913003692275084e-05, "loss": 0.0002, "num_tokens": 345876809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40377561500464615, "frac_reward_zero_std": 1.0, "grad_norm": 5.596798209226456e-05, "kl": 0.018157958984375, "learning_rate": 1.4901102171780175e-05, "loss": 0.0002, "num_tokens": 346430025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4041668704455421, "frac_reward_zero_std": 1.0, "grad_norm": 5.652733870576893e-05, "kl": 0.01849365234375, "learning_rate": 1.4889191507252743e-05, "loss": 0.0002, "num_tokens": 346982505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40455812588643814, "frac_reward_zero_std": 1.0, "grad_norm": 4.949823452909962e-05, "kl": 0.01800537109375, "learning_rate": 1.487727172091463e-05, "loss": 0.0002, "num_tokens": 347535897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4049493813273341, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001330208896145318, "kl": 0.018707275390625, "learning_rate": 1.4865342835004688e-05, "loss": 0.0002, "num_tokens": 348088185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40534063676823007, "frac_reward_zero_std": 1.0, "grad_norm": 3.6607407230029615e-05, "kl": 0.0186004638671875, "learning_rate": 1.4853404871778759e-05, "loss": 0.0002, "num_tokens": 348641737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40573189220912603, "frac_reward_zero_std": 1.0, "grad_norm": 0.001072031929303753, "kl": 0.018951416015625, "learning_rate": 1.4841457853509606e-05, "loss": 0.0002, "num_tokens": 349200009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.406123147650022, "frac_reward_zero_std": 1.0, "grad_norm": 7.656844255578937e-05, "kl": 0.0179901123046875, "learning_rate": 1.4829501802486899e-05, "loss": 0.0002, "num_tokens": 349753881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40651440309091796, "frac_reward_zero_std": 1.0, "grad_norm": 7.850837341028693e-05, "kl": 0.0178680419921875, "learning_rate": 1.4817536741017153e-05, "loss": 0.0002, "num_tokens": 350305449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.406905658531814, "frac_reward_zero_std": 1.0, "grad_norm": 5.617540729007727e-05, "kl": 0.0185394287109375, "learning_rate": 1.4805562691423698e-05, "loss": 0.0002, "num_tokens": 350860329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40729691397270995, "frac_reward_zero_std": 1.0, "grad_norm": 5.5619643812790315e-05, "kl": 0.0182952880859375, "learning_rate": 1.479357967604663e-05, "loss": 0.0002, "num_tokens": 351414905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4076881694136059, "frac_reward_zero_std": 1.0, "grad_norm": 6.524427816664224e-05, "kl": 0.019439697265625, "learning_rate": 1.4781587717242772e-05, "loss": 0.0002, "num_tokens": 351968857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4080794248545019, "frac_reward_zero_std": 1.0, "grad_norm": 5.368685612501452e-05, "kl": 0.0185394287109375, "learning_rate": 1.4769586837385635e-05, "loss": 0.0002, "num_tokens": 352522649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40847068029539785, "frac_reward_zero_std": 1.0, "grad_norm": 5.416701442536458e-05, "kl": 0.017547607421875, "learning_rate": 1.4757577058865376e-05, "loss": 0.0002, "num_tokens": 353075065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4088619357362938, "frac_reward_zero_std": 1.0, "grad_norm": 6.432118771418734e-05, "kl": 0.01751708984375, "learning_rate": 1.4745558404088751e-05, "loss": 0.0002, "num_tokens": 353629593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.40925319117718983, "frac_reward_zero_std": 1.0, "grad_norm": 5.7291901109845654e-05, "kl": 0.017913818359375, "learning_rate": 1.4733530895479074e-05, "loss": 0.0002, "num_tokens": 354184905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4096444466180858, "frac_reward_zero_std": 1.0, "grad_norm": 0.00018918720944672733, "kl": 0.018402099609375, "learning_rate": 1.4721494555476189e-05, "loss": 0.0002, "num_tokens": 354738841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41003570205898177, "frac_reward_zero_std": 1.0, "grad_norm": 5.492109254856141e-05, "kl": 0.0178680419921875, "learning_rate": 1.47094494065364e-05, "loss": 0.0002, "num_tokens": 355293561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41042695749987773, "frac_reward_zero_std": 1.0, "grad_norm": 4.93082571648071e-05, "kl": 0.017852783203125, "learning_rate": 1.469739547113246e-05, "loss": 0.0002, "num_tokens": 355848329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4108182129407737, "frac_reward_zero_std": 1.0, "grad_norm": 6.810104614424888e-05, "kl": 0.0184783935546875, "learning_rate": 1.4685332771753508e-05, "loss": 0.0002, "num_tokens": 356400985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41120946838166966, "frac_reward_zero_std": 1.0, "grad_norm": 4.530705960060388e-05, "kl": 0.0188751220703125, "learning_rate": 1.4673261330905043e-05, "loss": 0.0002, "num_tokens": 356954441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4116007238225657, "frac_reward_zero_std": 1.0, "grad_norm": 5.4425382882883915e-05, "kl": 0.0173492431640625, "learning_rate": 1.4661181171108855e-05, "loss": 0.0002, "num_tokens": 357508473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41199197926346165, "frac_reward_zero_std": 1.0, "grad_norm": 6.026745731634346e-05, "kl": 0.0178680419921875, "learning_rate": 1.4649092314903018e-05, "loss": 0.0002, "num_tokens": 358062281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4123832347043576, "frac_reward_zero_std": 1.0, "grad_norm": 7.314972016975635e-05, "kl": 0.0173187255859375, "learning_rate": 1.4636994784841824e-05, "loss": 0.0002, "num_tokens": 358616841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4127744901452536, "frac_reward_zero_std": 1.0, "grad_norm": 7.539258076239396e-05, "kl": 0.0174713134765625, "learning_rate": 1.462488860349575e-05, "loss": 0.0002, "num_tokens": 359171897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41316574558614955, "frac_reward_zero_std": 1.0, "grad_norm": 3.965490649471961e-05, "kl": 0.0183258056640625, "learning_rate": 1.4612773793451411e-05, "loss": 0.0002, "num_tokens": 359723993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4135570010270455, "frac_reward_zero_std": 1.0, "grad_norm": 5.925104814027132e-05, "kl": 0.0174102783203125, "learning_rate": 1.4600650377311523e-05, "loss": 0.0002, "num_tokens": 360275961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41394825646794153, "frac_reward_zero_std": 1.0, "grad_norm": 7.110995914486798e-05, "kl": 0.0181427001953125, "learning_rate": 1.4588518377694856e-05, "loss": 0.0002, "num_tokens": 360829081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4143395119088375, "frac_reward_zero_std": 1.0, "grad_norm": 4.636044713913235e-05, "kl": 0.0180816650390625, "learning_rate": 1.4576377817236202e-05, "loss": 0.0002, "num_tokens": 361380553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41473076734973346, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001824836182362349, "kl": 0.0205230712890625, "learning_rate": 1.456422871858631e-05, "loss": 0.0002, "num_tokens": 361936985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41512202279062943, "frac_reward_zero_std": 1.0, "grad_norm": 4.427809340810462e-05, "kl": 0.01806640625, "learning_rate": 1.4552071104411874e-05, "loss": 0.0002, "num_tokens": 362491609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4155132782315254, "frac_reward_zero_std": 1.0, "grad_norm": 8.028556760511103e-05, "kl": 0.01776123046875, "learning_rate": 1.4539904997395468e-05, "loss": 0.0002, "num_tokens": 363046089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41590453367242136, "frac_reward_zero_std": 1.0, "grad_norm": 4.386739588478481e-05, "kl": 0.017913818359375, "learning_rate": 1.4527730420235515e-05, "loss": 0.0002, "num_tokens": 363600249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4162957891133174, "frac_reward_zero_std": 1.0, "grad_norm": 4.455046239048007e-05, "kl": 0.018341064453125, "learning_rate": 1.4515547395646232e-05, "loss": 0.0002, "num_tokens": 364153273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41668704455421335, "frac_reward_zero_std": 1.0, "grad_norm": 0.0001492733447194067, "kl": 0.0170745849609375, "learning_rate": 1.450335594635761e-05, "loss": 0.0002, "num_tokens": 364707241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4170782999951093, "frac_reward_zero_std": 1.0, "grad_norm": 4.751482622972116e-05, "kl": 0.0183258056640625, "learning_rate": 1.4491156095115346e-05, "loss": 0.0002, "num_tokens": 365260153.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4174695554360053, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019397138618319575, "kl": 0.0191192626953125, "learning_rate": 1.4478947864680821e-05, "loss": 0.0002, "num_tokens": 365812681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41786081087690125, "frac_reward_zero_std": 1.0, "grad_norm": 4.662993765181419e-05, "kl": 0.017913818359375, "learning_rate": 1.4466731277831043e-05, "loss": 0.0002, "num_tokens": 366364585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4182520663177972, "frac_reward_zero_std": 1.0, "grad_norm": 2.6321692192419818e-05, "kl": 0.0176544189453125, "learning_rate": 1.4454506357358611e-05, "loss": 0.0002, "num_tokens": 366919865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41864332175869323, "frac_reward_zero_std": 1.0, "grad_norm": 5.9589153265186886e-05, "kl": 0.0194244384765625, "learning_rate": 1.4442273126071683e-05, "loss": 0.0002, "num_tokens": 367473305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4190345771995892, "frac_reward_zero_std": 1.0, "grad_norm": 5.2995635486934746e-05, "kl": 0.018310546875, "learning_rate": 1.4430031606793904e-05, "loss": 0.0002, "num_tokens": 368025657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41942583264048516, "frac_reward_zero_std": 1.0, "grad_norm": 4.258012565079482e-05, "kl": 0.0177459716796875, "learning_rate": 1.4417781822364396e-05, "loss": 0.0002, "num_tokens": 368580073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.41981708808138113, "frac_reward_zero_std": 1.0, "grad_norm": 3.7786100671463204e-05, "kl": 0.0189971923828125, "learning_rate": 1.44055237956377e-05, "loss": 0.0002, "num_tokens": 369135177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4202083435222771, "frac_reward_zero_std": 1.0, "grad_norm": 3.937157938153501e-05, "kl": 0.018280029296875, "learning_rate": 1.4393257549483728e-05, "loss": 0.0002, "num_tokens": 369691417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42059959896317306, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010774757801657697, "kl": 0.018402099609375, "learning_rate": 1.4380983106787731e-05, "loss": 0.0002, "num_tokens": 370244313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4209908544040691, "frac_reward_zero_std": 1.0, "grad_norm": 6.0799005975045266e-05, "kl": 0.0191497802734375, "learning_rate": 1.4368700490450256e-05, "loss": 0.0002, "num_tokens": 370797913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42138210984496505, "frac_reward_zero_std": 1.0, "grad_norm": 6.084396355855657e-05, "kl": 0.0184173583984375, "learning_rate": 1.4356409723387092e-05, "loss": 0.0002, "num_tokens": 371350873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.421773365285861, "frac_reward_zero_std": 1.0, "grad_norm": 3.961026320228599e-05, "kl": 0.0175933837890625, "learning_rate": 1.4344110828529246e-05, "loss": 0.0002, "num_tokens": 371906233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.422164620726757, "frac_reward_zero_std": 1.0, "grad_norm": 5.728486929703155e-05, "kl": 0.018157958984375, "learning_rate": 1.4331803828822873e-05, "loss": 0.0002, "num_tokens": 372459241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42255587616765294, "frac_reward_zero_std": 1.0, "grad_norm": 3.693501763815895e-05, "kl": 0.0179443359375, "learning_rate": 1.4319488747229262e-05, "loss": 0.0002, "num_tokens": 373012937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4229471316085489, "frac_reward_zero_std": 1.0, "grad_norm": 2.3833344922862665e-05, "kl": 0.0179290771484375, "learning_rate": 1.4307165606724777e-05, "loss": 0.0002, "num_tokens": 373567961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42333838704944493, "frac_reward_zero_std": 1.0, "grad_norm": 4.5129308105087905e-05, "kl": 0.0172119140625, "learning_rate": 1.4294834430300822e-05, "loss": 0.0002, "num_tokens": 374123929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4237296424903409, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012722019593811452, "kl": 0.0179595947265625, "learning_rate": 1.4282495240963781e-05, "loss": 0.0002, "num_tokens": 374677497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42412089793123686, "frac_reward_zero_std": 1.0, "grad_norm": 0.2707585501645407, "kl": 0.1249237060546875, "learning_rate": 1.4270148061734999e-05, "loss": 0.0013, "num_tokens": 375230985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4245121533721328, "frac_reward_zero_std": 1.0, "grad_norm": 8.145685019760654e-05, "kl": 0.0186004638671875, "learning_rate": 1.4257792915650728e-05, "loss": 0.0002, "num_tokens": 375785625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4249034088130288, "frac_reward_zero_std": 1.0, "grad_norm": 0.00012104313156606478, "kl": 0.017608642578125, "learning_rate": 1.4245429825762078e-05, "loss": 0.0002, "num_tokens": 376340729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42529466425392476, "frac_reward_zero_std": 1.0, "grad_norm": 8.741244626592673e-05, "kl": 0.0186614990234375, "learning_rate": 1.4233058815134978e-05, "loss": 0.0002, "num_tokens": 376893833.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4256859196948208, "frac_reward_zero_std": 1.0, "grad_norm": 0.002080500849004027, "kl": 0.0196380615234375, "learning_rate": 1.4220679906850148e-05, "loss": 0.0002, "num_tokens": 377445673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42607717513571675, "frac_reward_zero_std": 1.0, "grad_norm": 0.00017929329766540333, "kl": 0.0184783935546875, "learning_rate": 1.4208293124003028e-05, "loss": 0.0002, "num_tokens": 378000121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4264684305766127, "frac_reward_zero_std": 1.0, "grad_norm": 0.00019424174088163908, "kl": 0.0196380615234375, "learning_rate": 1.4195898489703757e-05, "loss": 0.0002, "num_tokens": 378554873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4268596860175087, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009892781256851085, "kl": 0.0181427001953125, "learning_rate": 1.4183496027077119e-05, "loss": 0.0002, "num_tokens": 379109593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42725094145840464, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004618486004199626, "kl": 0.018402099609375, "learning_rate": 1.4171085759262507e-05, "loss": 0.0002, "num_tokens": 379663401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4276421968993006, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007799100230892077, "kl": 0.019927978515625, "learning_rate": 1.4158667709413877e-05, "loss": 0.0002, "num_tokens": 380215209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42803345234019663, "frac_reward_zero_std": 1.0, "grad_norm": 0.000348511686317043, "kl": 0.0204925537109375, "learning_rate": 1.41462419006997e-05, "loss": 0.0002, "num_tokens": 380769881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4284247077810926, "frac_reward_zero_std": 1.0, "grad_norm": 0.0004993141693963123, "kl": 0.019317626953125, "learning_rate": 1.413380835630292e-05, "loss": 0.0002, "num_tokens": 381322329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42881596322198856, "frac_reward_zero_std": 1.0, "grad_norm": 0.000818288678338089, "kl": 0.020721435546875, "learning_rate": 1.4121367099420926e-05, "loss": 0.0002, "num_tokens": 381876985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4292072186628845, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007298995983162091, "kl": 0.0216522216796875, "learning_rate": 1.4108918153265485e-05, "loss": 0.0002, "num_tokens": 382432553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4295984741037805, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006704662337062795, "kl": 0.023162841796875, "learning_rate": 1.409646154106271e-05, "loss": 0.0002, "num_tokens": 382985945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.42998972954467646, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008706994939987274, "kl": 0.0205535888671875, "learning_rate": 1.4083997286053022e-05, "loss": 0.0002, "num_tokens": 383541449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4303809849855725, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010643218274030484, "kl": 0.020416259765625, "learning_rate": 1.4071525411491099e-05, "loss": 0.0002, "num_tokens": 384095657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43077224042646844, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014721291323511276, "kl": 0.0235595703125, "learning_rate": 1.4059045940645834e-05, "loss": 0.0002, "num_tokens": 384647961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4311634958673644, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008440169941646078, "kl": 0.0217132568359375, "learning_rate": 1.4046558896800299e-05, "loss": 0.0002, "num_tokens": 385203929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4315547513082604, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011693935967236105, "kl": 0.0229949951171875, "learning_rate": 1.4034064303251683e-05, "loss": 0.0002, "num_tokens": 385759049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43194600674915634, "frac_reward_zero_std": 1.0, "grad_norm": 0.0037515398606920835, "kl": 0.024749755859375, "learning_rate": 1.4021562183311268e-05, "loss": 0.0002, "num_tokens": 386313305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4323372621900523, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015766741630857761, "kl": 0.0230712890625, "learning_rate": 1.400905256030438e-05, "loss": 0.0002, "num_tokens": 386865385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43272851763094833, "frac_reward_zero_std": 1.0, "grad_norm": 0.0007440629453854054, "kl": 0.0237579345703125, "learning_rate": 1.3996535457570335e-05, "loss": 0.0002, "num_tokens": 387418665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4331197730718443, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012258222712041762, "kl": 0.024810791015625, "learning_rate": 1.3984010898462417e-05, "loss": 0.0002, "num_tokens": 387973561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43351102851274026, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017467188382207074, "kl": 0.0242462158203125, "learning_rate": 1.3971478906347806e-05, "loss": 0.0002, "num_tokens": 388527209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4339022839536362, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016214474049901317, "kl": 0.0243377685546875, "learning_rate": 1.3958939504607565e-05, "loss": 0.0002, "num_tokens": 389081113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4342935393945322, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010647026711535917, "kl": 0.0224761962890625, "learning_rate": 1.3946392716636573e-05, "loss": 0.0002, "num_tokens": 389633129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43468479483542816, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011756261211425577, "kl": 0.02484130859375, "learning_rate": 1.3933838565843485e-05, "loss": 0.0002, "num_tokens": 390185481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4350760502763242, "frac_reward_zero_std": 1.0, "grad_norm": 0.001033076156228097, "kl": 0.0234375, "learning_rate": 1.3921277075650703e-05, "loss": 0.0002, "num_tokens": 390740041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43546730571722014, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014115325867740422, "kl": 0.0232696533203125, "learning_rate": 1.3908708269494318e-05, "loss": 0.0002, "num_tokens": 391294505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4358585611581161, "frac_reward_zero_std": 1.0, "grad_norm": 0.02068679264524458, "kl": 0.02520751953125, "learning_rate": 1.3896132170824067e-05, "loss": 0.0003, "num_tokens": 391847097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4362498165990121, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017234472088343486, "kl": 0.0228271484375, "learning_rate": 1.3883548803103298e-05, "loss": 0.0002, "num_tokens": 392401001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43664107203990804, "frac_reward_zero_std": 1.0, "grad_norm": 0.001961658109522559, "kl": 0.0241851806640625, "learning_rate": 1.3870958189808921e-05, "loss": 0.0002, "num_tokens": 392953449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.437032327480804, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020704581510648623, "kl": 0.0234375, "learning_rate": 1.3858360354431355e-05, "loss": 0.0002, "num_tokens": 393507673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4374235829217, "frac_reward_zero_std": 1.0, "grad_norm": 0.001558156341240675, "kl": 0.0248260498046875, "learning_rate": 1.3845755320474505e-05, "loss": 0.0002, "num_tokens": 394065193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.437814838362596, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020339378802988837, "kl": 0.0234832763671875, "learning_rate": 1.3833143111455705e-05, "loss": 0.0002, "num_tokens": 394620169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43820609380349196, "frac_reward_zero_std": 1.0, "grad_norm": 0.0018188090679061688, "kl": 0.0238800048828125, "learning_rate": 1.3820523750905667e-05, "loss": 0.0002, "num_tokens": 395174105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4385973492443879, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013863253837138243, "kl": 0.0261383056640625, "learning_rate": 1.3807897262368453e-05, "loss": 0.0003, "num_tokens": 395727081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4389886046852839, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012596922852855246, "kl": 0.02484130859375, "learning_rate": 1.379526366940142e-05, "loss": 0.0002, "num_tokens": 396280873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.43937986012617986, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021886034923756428, "kl": 0.0232391357421875, "learning_rate": 1.3782622995575182e-05, "loss": 0.0002, "num_tokens": 396835257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4397711155670759, "frac_reward_zero_std": 1.0, "grad_norm": 0.002860137456080648, "kl": 0.0252838134765625, "learning_rate": 1.3769975264473568e-05, "loss": 0.0003, "num_tokens": 397388121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44016237100797184, "frac_reward_zero_std": 1.0, "grad_norm": 0.011298536484324261, "kl": 0.0291900634765625, "learning_rate": 1.3757320499693561e-05, "loss": 0.0003, "num_tokens": 397940025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4405536264488678, "frac_reward_zero_std": 1.0, "grad_norm": 0.002630463642499909, "kl": 0.0238189697265625, "learning_rate": 1.3744658724845278e-05, "loss": 0.0002, "num_tokens": 398492377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4409448818897638, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020156205940459627, "kl": 0.02490234375, "learning_rate": 1.3731989963551916e-05, "loss": 0.0002, "num_tokens": 399045097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44133613733065974, "frac_reward_zero_std": 1.0, "grad_norm": 0.001684320241017846, "kl": 0.0237579345703125, "learning_rate": 1.3719314239449696e-05, "loss": 0.0002, "num_tokens": 399598857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4417273927715557, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031547848063410654, "kl": 0.024383544921875, "learning_rate": 1.3706631576187842e-05, "loss": 0.0002, "num_tokens": 400154201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4421186482124517, "frac_reward_zero_std": 1.0, "grad_norm": 0.002721232921971718, "kl": 0.02496337890625, "learning_rate": 1.3693941997428514e-05, "loss": 0.0003, "num_tokens": 400710953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4425099036533477, "frac_reward_zero_std": 1.0, "grad_norm": 0.04551930654313305, "kl": 0.04638671875, "learning_rate": 1.3681245526846782e-05, "loss": 0.0005, "num_tokens": 401263961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44290115909424366, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009304429431551499, "kl": 0.02337646484375, "learning_rate": 1.3668542188130567e-05, "loss": 0.0002, "num_tokens": 401817801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4432924145351396, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013007470249319448, "kl": 0.02471923828125, "learning_rate": 1.365583200498061e-05, "loss": 0.0002, "num_tokens": 402369897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4436836699760356, "frac_reward_zero_std": 1.0, "grad_norm": 0.001183733951829245, "kl": 0.023223876953125, "learning_rate": 1.3643115001110419e-05, "loss": 0.0002, "num_tokens": 402923721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44407492541693155, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010573119092817035, "kl": 0.0245361328125, "learning_rate": 1.3630391200246228e-05, "loss": 0.0002, "num_tokens": 403476409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4444661808578276, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017492309634719906, "kl": 0.0244598388671875, "learning_rate": 1.3617660626126957e-05, "loss": 0.0002, "num_tokens": 404029033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44485743629872354, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013625578505471927, "kl": 0.0247650146484375, "learning_rate": 1.3604923302504146e-05, "loss": 0.0002, "num_tokens": 404581705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4452486917396195, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017130815887321498, "kl": 0.0246734619140625, "learning_rate": 1.3592179253141955e-05, "loss": 0.0002, "num_tokens": 405135097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44563994718051547, "frac_reward_zero_std": 1.0, "grad_norm": 0.002697456291285138, "kl": 0.025665283203125, "learning_rate": 1.3579428501817067e-05, "loss": 0.0003, "num_tokens": 405687913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44603120262141144, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015932716353403479, "kl": 0.024383544921875, "learning_rate": 1.3566671072318678e-05, "loss": 0.0002, "num_tokens": 406242809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4464224580623074, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016409035265932137, "kl": 0.0241851806640625, "learning_rate": 1.3553906988448451e-05, "loss": 0.0002, "num_tokens": 406795753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4468137135032034, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013796847892491402, "kl": 0.023101806640625, "learning_rate": 1.3541136274020452e-05, "loss": 0.0002, "num_tokens": 407347785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4472049689440994, "frac_reward_zero_std": 1.0, "grad_norm": 0.0022701764723991025, "kl": 0.0257720947265625, "learning_rate": 1.352835895286113e-05, "loss": 0.0003, "num_tokens": 407900569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44759622438499536, "frac_reward_zero_std": 1.0, "grad_norm": 0.001957305991291238, "kl": 0.0243988037109375, "learning_rate": 1.3515575048809247e-05, "loss": 0.0002, "num_tokens": 408454745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4479874798258913, "frac_reward_zero_std": 1.0, "grad_norm": 0.03421899351984684, "kl": 0.0402984619140625, "learning_rate": 1.3502784585715853e-05, "loss": 0.0004, "num_tokens": 409007161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4483787352667873, "frac_reward_zero_std": 1.0, "grad_norm": 0.001283006502265639, "kl": 0.0258026123046875, "learning_rate": 1.3489987587444241e-05, "loss": 0.0003, "num_tokens": 409565433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44876999070768325, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010329649091421022, "kl": 0.0236663818359375, "learning_rate": 1.3477184077869892e-05, "loss": 0.0002, "num_tokens": 410118857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4491612461485793, "frac_reward_zero_std": 1.0, "grad_norm": 0.002204535764136053, "kl": 0.027130126953125, "learning_rate": 1.3464374080880425e-05, "loss": 0.0003, "num_tokens": 410673529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.44955250158947524, "frac_reward_zero_std": 1.0, "grad_norm": 0.006230225710947427, "kl": 0.0271453857421875, "learning_rate": 1.3451557620375578e-05, "loss": 0.0003, "num_tokens": 411226761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4499437570303712, "frac_reward_zero_std": 1.0, "grad_norm": 0.001740300027765016, "kl": 0.026763916015625, "learning_rate": 1.3438734720267145e-05, "loss": 0.0003, "num_tokens": 411780185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45033501247126717, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009860464932299204, "kl": 0.0265045166015625, "learning_rate": 1.3425905404478929e-05, "loss": 0.0003, "num_tokens": 412332201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45072626791216314, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008031114806501887, "kl": 0.0271453857421875, "learning_rate": 1.3413069696946706e-05, "loss": 0.0003, "num_tokens": 412885545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4511175233530591, "frac_reward_zero_std": 1.0, "grad_norm": 0.0020290602256862003, "kl": 0.02593994140625, "learning_rate": 1.340022762161817e-05, "loss": 0.0003, "num_tokens": 413439529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4515087787939551, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019371743416002124, "kl": 0.0268707275390625, "learning_rate": 1.3387379202452917e-05, "loss": 0.0003, "num_tokens": 413992265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4519000342348511, "frac_reward_zero_std": 1.0, "grad_norm": 0.001886638517361769, "kl": 0.02447509765625, "learning_rate": 1.337452446342235e-05, "loss": 0.0002, "num_tokens": 414548025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45229128967574705, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019348412070726425, "kl": 0.0250701904296875, "learning_rate": 1.3361663428509679e-05, "loss": 0.0003, "num_tokens": 415101593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.452682545116643, "frac_reward_zero_std": 1.0, "grad_norm": 0.0021476479348639427, "kl": 0.0257415771484375, "learning_rate": 1.3348796121709862e-05, "loss": 0.0003, "num_tokens": 415655801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.453073800557539, "frac_reward_zero_std": 1.0, "grad_norm": 0.0009793577049822084, "kl": 0.0279083251953125, "learning_rate": 1.3335922567029556e-05, "loss": 0.0003, "num_tokens": 416210057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45346505599843495, "frac_reward_zero_std": 1.0, "grad_norm": 0.0016616971053086214, "kl": 0.0261993408203125, "learning_rate": 1.3323042788487066e-05, "loss": 0.0003, "num_tokens": 416762889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.453856311439331, "frac_reward_zero_std": 1.0, "grad_norm": 0.002507566271263632, "kl": 0.0245513916015625, "learning_rate": 1.3310156810112322e-05, "loss": 0.0002, "num_tokens": 417318473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45424756688022694, "frac_reward_zero_std": 1.0, "grad_norm": 0.0012117655307809275, "kl": 0.0259246826171875, "learning_rate": 1.3297264655946816e-05, "loss": 0.0003, "num_tokens": 417871849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4546388223211229, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015344312184129504, "kl": 0.024169921875, "learning_rate": 1.3284366350043558e-05, "loss": 0.0002, "num_tokens": 418426601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45503007776201887, "frac_reward_zero_std": 1.0, "grad_norm": 0.009497436471724617, "kl": 0.03131103515625, "learning_rate": 1.3271461916467038e-05, "loss": 0.0003, "num_tokens": 418979769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45542133320291484, "frac_reward_zero_std": 1.0, "grad_norm": 0.001864288606044375, "kl": 0.0255279541015625, "learning_rate": 1.3258551379293185e-05, "loss": 0.0003, "num_tokens": 419532121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4558125886438108, "frac_reward_zero_std": 1.0, "grad_norm": 0.0019983490956044536, "kl": 0.0264892578125, "learning_rate": 1.3245634762609308e-05, "loss": 0.0003, "num_tokens": 420086089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4562038440847068, "frac_reward_zero_std": 1.0, "grad_norm": 0.002205334670543926, "kl": 0.0247039794921875, "learning_rate": 1.3232712090514057e-05, "loss": 0.0002, "num_tokens": 420638681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4565950995256028, "frac_reward_zero_std": 1.0, "grad_norm": 0.001253663397359313, "kl": 0.02655029296875, "learning_rate": 1.3219783387117384e-05, "loss": 0.0003, "num_tokens": 421192537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45698635496649875, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014729992439247563, "kl": 0.0247344970703125, "learning_rate": 1.3206848676540495e-05, "loss": 0.0002, "num_tokens": 421745913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4573776104073947, "frac_reward_zero_std": 1.0, "grad_norm": 0.07768161662226224, "kl": 0.03216552734375, "learning_rate": 1.31939079829158e-05, "loss": 0.0003, "num_tokens": 422300009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4577688658482907, "frac_reward_zero_std": 1.0, "grad_norm": 0.003397552297000635, "kl": 0.0280303955078125, "learning_rate": 1.3180961330386876e-05, "loss": 0.0003, "num_tokens": 422852249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45816012128918665, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013860991532605343, "kl": 0.031585693359375, "learning_rate": 1.3168008743108412e-05, "loss": 0.0003, "num_tokens": 423406825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45855137673008267, "frac_reward_zero_std": 1.0, "grad_norm": 0.0013430226383218454, "kl": 0.03387451171875, "learning_rate": 1.3155050245246171e-05, "loss": 0.0003, "num_tokens": 423960537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45894263217097864, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014299907728948125, "kl": 0.03448486328125, "learning_rate": 1.3142085860976947e-05, "loss": 0.0003, "num_tokens": 424513417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4593338876118746, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011916069576289507, "kl": 0.0338134765625, "learning_rate": 1.3129115614488512e-05, "loss": 0.0003, "num_tokens": 425067897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.45972514305277057, "frac_reward_zero_std": 1.0, "grad_norm": 0.008690056914529374, "kl": 0.03839111328125, "learning_rate": 1.3116139529979579e-05, "loss": 0.0004, "num_tokens": 425622761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46011639849366653, "frac_reward_zero_std": 1.0, "grad_norm": 0.0014224795526698504, "kl": 0.0341796875, "learning_rate": 1.310315763165975e-05, "loss": 0.0003, "num_tokens": 426177993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46050765393456256, "frac_reward_zero_std": 1.0, "grad_norm": 0.0006753677225440645, "kl": 0.035736083984375, "learning_rate": 1.3090169943749475e-05, "loss": 0.0004, "num_tokens": 426732521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4608989093754585, "frac_reward_zero_std": 1.0, "grad_norm": 0.002057427314103889, "kl": 0.03411865234375, "learning_rate": 1.3077176490480008e-05, "loss": 0.0003, "num_tokens": 427285577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4612901648163545, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008398062321384095, "kl": 0.033294677734375, "learning_rate": 1.3064177296093355e-05, "loss": 0.0003, "num_tokens": 427838937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46168142025725045, "frac_reward_zero_std": 1.0, "grad_norm": 0.001031131165772159, "kl": 0.0343017578125, "learning_rate": 1.3051172384842233e-05, "loss": 0.0003, "num_tokens": 428392217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4620726756981464, "frac_reward_zero_std": 1.0, "grad_norm": 0.006353442063380532, "kl": 0.03680419921875, "learning_rate": 1.3038161780990036e-05, "loss": 0.0004, "num_tokens": 428947113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4624639311390424, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027715239250787256, "kl": 0.03643798828125, "learning_rate": 1.302514550881076e-05, "loss": 0.0004, "num_tokens": 429501321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4628551865799384, "frac_reward_zero_std": 1.0, "grad_norm": 0.001297393772279838, "kl": 0.034912109375, "learning_rate": 1.3012123592588991e-05, "loss": 0.0003, "num_tokens": 430054505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46324644202083437, "frac_reward_zero_std": 1.0, "grad_norm": 0.0008174320485933794, "kl": 0.033905029296875, "learning_rate": 1.2999096056619844e-05, "loss": 0.0003, "num_tokens": 430606425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46363769746173034, "frac_reward_zero_std": 1.0, "grad_norm": 0.001462752115379712, "kl": 0.0335693359375, "learning_rate": 1.2986062925208913e-05, "loss": 0.0003, "num_tokens": 431161225.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4640289529026263, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010129652019039929, "kl": 0.034576416015625, "learning_rate": 1.2973024222672233e-05, "loss": 0.0003, "num_tokens": 431716185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46442020834352227, "frac_reward_zero_std": 1.0, "grad_norm": 0.0011049119090875339, "kl": 0.034912109375, "learning_rate": 1.2959979973336236e-05, "loss": 0.0003, "num_tokens": 432270425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46481146378441823, "frac_reward_zero_std": 1.0, "grad_norm": 0.0010365913680600507, "kl": 0.033843994140625, "learning_rate": 1.29469302015377e-05, "loss": 0.0003, "num_tokens": 432823737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46520271922531425, "frac_reward_zero_std": 1.0, "grad_norm": 0.0031490460666917268, "kl": 0.034759521484375, "learning_rate": 1.2933874931623707e-05, "loss": 0.0003, "num_tokens": 433379577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4655939746662102, "frac_reward_zero_std": 1.0, "grad_norm": 0.0036456279187956344, "kl": 0.036163330078125, "learning_rate": 1.2920814187951598e-05, "loss": 0.0004, "num_tokens": 433931033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4659852301071062, "frac_reward_zero_std": 1.0, "grad_norm": 0.12335906892071735, "kl": 0.069366455078125, "learning_rate": 1.2907747994888923e-05, "loss": 0.0007, "num_tokens": 434486585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46637648554800215, "frac_reward_zero_std": 1.0, "grad_norm": 0.011194688891074753, "kl": 0.038482666015625, "learning_rate": 1.289467637681341e-05, "loss": 0.0004, "num_tokens": 435040233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4667677409888981, "frac_reward_zero_std": 1.0, "grad_norm": 0.004187991634041862, "kl": 0.039794921875, "learning_rate": 1.2881599358112888e-05, "loss": 0.0004, "num_tokens": 435592665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4671589964297941, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027306509138631204, "kl": 0.041046142578125, "learning_rate": 1.286851696318528e-05, "loss": 0.0004, "num_tokens": 436145993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4675502518706901, "frac_reward_zero_std": 1.0, "grad_norm": 0.011327632730489439, "kl": 0.04754638671875, "learning_rate": 1.2855429216438538e-05, "loss": 0.0005, "num_tokens": 436698457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46794150731158607, "frac_reward_zero_std": 1.0, "grad_norm": 0.002736728879434467, "kl": 0.042205810546875, "learning_rate": 1.2842336142290587e-05, "loss": 0.0004, "num_tokens": 437254665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46833276275248203, "frac_reward_zero_std": 1.0, "grad_norm": 0.029096585744291414, "kl": 0.06109619140625, "learning_rate": 1.28292377651693e-05, "loss": 0.0006, "num_tokens": 437806905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.468724018193378, "frac_reward_zero_std": 1.0, "grad_norm": 0.004709872832398759, "kl": 0.04791259765625, "learning_rate": 1.2816134109512448e-05, "loss": 0.0005, "num_tokens": 438359657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46911527363427397, "frac_reward_zero_std": 1.0, "grad_norm": 0.013039751391344435, "kl": 0.05377197265625, "learning_rate": 1.2803025199767638e-05, "loss": 0.0005, "num_tokens": 438913769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46950652907516993, "frac_reward_zero_std": 1.0, "grad_norm": 0.0038894484786021026, "kl": 0.0518798828125, "learning_rate": 1.2789911060392295e-05, "loss": 0.0005, "num_tokens": 439468489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.46989778451606595, "frac_reward_zero_std": 1.0, "grad_norm": 0.009199161776601747, "kl": 0.05511474609375, "learning_rate": 1.2776791715853585e-05, "loss": 0.0006, "num_tokens": 440026121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4702890399569619, "frac_reward_zero_std": 1.0, "grad_norm": 0.02041999769985932, "kl": 0.063201904296875, "learning_rate": 1.2763667190628391e-05, "loss": 0.0006, "num_tokens": 440577785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4706802953978579, "frac_reward_zero_std": 1.0, "grad_norm": 0.011682622010963187, "kl": 0.05987548828125, "learning_rate": 1.275053750920327e-05, "loss": 0.0006, "num_tokens": 441130793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47107155083875385, "frac_reward_zero_std": 1.0, "grad_norm": 0.11368567534789153, "kl": 0.111572265625, "learning_rate": 1.2737402696074393e-05, "loss": 0.0011, "num_tokens": 441686217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4714628062796498, "frac_reward_zero_std": 1.0, "grad_norm": 0.007645374502474201, "kl": 0.060028076171875, "learning_rate": 1.2724262775747499e-05, "loss": 0.0006, "num_tokens": 442242201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4718540617205458, "frac_reward_zero_std": 1.0, "grad_norm": 0.010209403726872355, "kl": 0.0643310546875, "learning_rate": 1.2711117772737864e-05, "loss": 0.0006, "num_tokens": 442797017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4722453171614418, "frac_reward_zero_std": 1.0, "grad_norm": 0.010289952611777486, "kl": 0.0672607421875, "learning_rate": 1.2697967711570243e-05, "loss": 0.0007, "num_tokens": 443350537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47263657260233777, "frac_reward_zero_std": 1.0, "grad_norm": 0.011212874730842218, "kl": 0.0733642578125, "learning_rate": 1.2684812616778832e-05, "loss": 0.0007, "num_tokens": 443904553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47302782804323373, "frac_reward_zero_std": 1.0, "grad_norm": 0.012079920051519855, "kl": 0.07843017578125, "learning_rate": 1.2671652512907213e-05, "loss": 0.0008, "num_tokens": 444459993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4734190834841297, "frac_reward_zero_std": 1.0, "grad_norm": 0.01521771803966611, "kl": 0.08636474609375, "learning_rate": 1.2658487424508314e-05, "loss": 0.0009, "num_tokens": 445013961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47381033892502566, "frac_reward_zero_std": 1.0, "grad_norm": 0.014367242858218431, "kl": 0.09088134765625, "learning_rate": 1.2645317376144368e-05, "loss": 0.0009, "num_tokens": 445570073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47420159436592163, "frac_reward_zero_std": 1.0, "grad_norm": 0.020831944958931134, "kl": 0.093994140625, "learning_rate": 1.263214239238686e-05, "loss": 0.0009, "num_tokens": 446123513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47459284980681765, "frac_reward_zero_std": 1.0, "grad_norm": 0.03713704686713733, "kl": 0.10064697265625, "learning_rate": 1.261896249781647e-05, "loss": 0.001, "num_tokens": 446677577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4749841052477136, "frac_reward_zero_std": 1.0, "grad_norm": 0.02240708511705538, "kl": 0.095947265625, "learning_rate": 1.2605777717023069e-05, "loss": 0.001, "num_tokens": 447231481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4753753606886096, "frac_reward_zero_std": 1.0, "grad_norm": 0.025319263750423957, "kl": 0.09954833984375, "learning_rate": 1.2592588074605614e-05, "loss": 0.001, "num_tokens": 447783657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47576661612950555, "frac_reward_zero_std": 1.0, "grad_norm": 0.0222178098471352, "kl": 0.0966796875, "learning_rate": 1.2579393595172147e-05, "loss": 0.001, "num_tokens": 448337273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4761578715704015, "frac_reward_zero_std": 1.0, "grad_norm": 0.021667819282402127, "kl": 0.09881591796875, "learning_rate": 1.2566194303339738e-05, "loss": 0.001, "num_tokens": 448892329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4765491270112975, "frac_reward_zero_std": 1.0, "grad_norm": 0.01314276492979221, "kl": 0.08697509765625, "learning_rate": 1.2552990223734425e-05, "loss": 0.0009, "num_tokens": 449447817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4769403824521935, "frac_reward_zero_std": 1.0, "grad_norm": 0.008396802406882863, "kl": 0.07891845703125, "learning_rate": 1.2539781380991187e-05, "loss": 0.0008, "num_tokens": 450001849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47733163789308947, "frac_reward_zero_std": 1.0, "grad_norm": 0.006301351449205484, "kl": 0.072021484375, "learning_rate": 1.2526567799753883e-05, "loss": 0.0007, "num_tokens": 450553769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47772289333398543, "frac_reward_zero_std": 1.0, "grad_norm": 0.0068338789489404495, "kl": 0.0626220703125, "learning_rate": 1.251334950467522e-05, "loss": 0.0006, "num_tokens": 451109033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4781141487748814, "frac_reward_zero_std": 1.0, "grad_norm": 0.010598394249703371, "kl": 0.060760498046875, "learning_rate": 1.250012652041669e-05, "loss": 0.0006, "num_tokens": 451663417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47850540421577736, "frac_reward_zero_std": 1.0, "grad_norm": 0.0077433212761634194, "kl": 0.0538330078125, "learning_rate": 1.2486898871648552e-05, "loss": 0.0005, "num_tokens": 452217145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47889665965667333, "frac_reward_zero_std": 1.0, "grad_norm": 0.016055267169341813, "kl": 0.052978515625, "learning_rate": 1.2473666583049738e-05, "loss": 0.0005, "num_tokens": 452772681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.47928791509756935, "frac_reward_zero_std": 1.0, "grad_norm": 0.007454184248784139, "kl": 0.046356201171875, "learning_rate": 1.2460429679307863e-05, "loss": 0.0005, "num_tokens": 453327017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4796791705384653, "frac_reward_zero_std": 1.0, "grad_norm": 0.008253624609561316, "kl": 0.047027587890625, "learning_rate": 1.2447188185119143e-05, "loss": 0.0005, "num_tokens": 453881161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4800704259793613, "frac_reward_zero_std": 1.0, "grad_norm": 0.011095149484398237, "kl": 0.0467529296875, "learning_rate": 1.2433942125188359e-05, "loss": 0.0005, "num_tokens": 454435001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48046168142025725, "frac_reward_zero_std": 1.0, "grad_norm": 0.011386385003539884, "kl": 0.0472412109375, "learning_rate": 1.2420691524228804e-05, "loss": 0.0005, "num_tokens": 454988937.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4808529368611532, "frac_reward_zero_std": 1.0, "grad_norm": 0.009765985495877929, "kl": 0.04632568359375, "learning_rate": 1.240743640696226e-05, "loss": 0.0005, "num_tokens": 455542537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4812441923020492, "frac_reward_zero_std": 1.0, "grad_norm": 0.011304843867459809, "kl": 0.050140380859375, "learning_rate": 1.2394176798118914e-05, "loss": 0.0005, "num_tokens": 456096585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4816354477429452, "frac_reward_zero_std": 1.0, "grad_norm": 0.008407912983616353, "kl": 0.0501708984375, "learning_rate": 1.2380912722437348e-05, "loss": 0.0005, "num_tokens": 456652697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48202670318384117, "frac_reward_zero_std": 1.0, "grad_norm": 0.010871456040808908, "kl": 0.05560302734375, "learning_rate": 1.2367644204664468e-05, "loss": 0.0006, "num_tokens": 457207177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48241795862473713, "frac_reward_zero_std": 1.0, "grad_norm": 0.0072006317499089725, "kl": 0.0596923828125, "learning_rate": 1.2354371269555478e-05, "loss": 0.0006, "num_tokens": 457759337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4828092140656331, "frac_reward_zero_std": 1.0, "grad_norm": 0.008351546219302317, "kl": 0.06494140625, "learning_rate": 1.234109394187382e-05, "loss": 0.0006, "num_tokens": 458313785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48320046950652906, "frac_reward_zero_std": 1.0, "grad_norm": 0.0069447685939793636, "kl": 0.0679931640625, "learning_rate": 1.2327812246391124e-05, "loss": 0.0007, "num_tokens": 458866681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48359172494742503, "frac_reward_zero_std": 1.0, "grad_norm": 0.011152023298656446, "kl": 0.072509765625, "learning_rate": 1.2314526207887176e-05, "loss": 0.0007, "num_tokens": 459419577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48398298038832105, "frac_reward_zero_std": 1.0, "grad_norm": 0.013596231094425891, "kl": 0.0750732421875, "learning_rate": 1.2301235851149867e-05, "loss": 0.0008, "num_tokens": 459972793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.484374235829217, "frac_reward_zero_std": 1.0, "grad_norm": 0.0061336356514646185, "kl": 0.0782470703125, "learning_rate": 1.2287941200975135e-05, "loss": 0.0008, "num_tokens": 460528185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.484765491270113, "frac_reward_zero_std": 1.0, "grad_norm": 0.0044087559456141255, "kl": 0.078857421875, "learning_rate": 1.2274642282166935e-05, "loss": 0.0008, "num_tokens": 461079945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48515674671100895, "frac_reward_zero_std": 1.0, "grad_norm": 0.013007024320075817, "kl": 0.0841064453125, "learning_rate": 1.226133911953719e-05, "loss": 0.0008, "num_tokens": 461633849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4855480021519049, "frac_reward_zero_std": 1.0, "grad_norm": 0.2685719728982778, "kl": 0.09417724609375, "learning_rate": 1.2248031737905732e-05, "loss": 0.0009, "num_tokens": 462187417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4859392575928009, "frac_reward_zero_std": 1.0, "grad_norm": 0.010858583280014994, "kl": 0.10699462890625, "learning_rate": 1.2234720162100271e-05, "loss": 0.0011, "num_tokens": 462740489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4863305130336969, "frac_reward_zero_std": 1.0, "grad_norm": 0.01647081212007775, "kl": 0.11541748046875, "learning_rate": 1.2221404416956338e-05, "loss": 0.0012, "num_tokens": 463299305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48672176847459286, "frac_reward_zero_std": 1.0, "grad_norm": 0.008010180902903685, "kl": 0.1282958984375, "learning_rate": 1.220808452731724e-05, "loss": 0.0013, "num_tokens": 463851417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48711302391548883, "frac_reward_zero_std": 1.0, "grad_norm": 0.0490007815315267, "kl": 0.1407470703125, "learning_rate": 1.2194760518034029e-05, "loss": 0.0014, "num_tokens": 464405641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4875042793563848, "frac_reward_zero_std": 1.0, "grad_norm": 0.02838572451914095, "kl": 0.1585693359375, "learning_rate": 1.2181432413965428e-05, "loss": 0.0016, "num_tokens": 464960825.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48789553479728076, "frac_reward_zero_std": 1.0, "grad_norm": 0.02877092826250136, "kl": 0.184814453125, "learning_rate": 1.216810023997781e-05, "loss": 0.0019, "num_tokens": 465515641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4882867902381767, "frac_reward_zero_std": 1.0, "grad_norm": 0.047111728397546675, "kl": 0.2210693359375, "learning_rate": 1.2154764020945135e-05, "loss": 0.0022, "num_tokens": 466069289.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48867804567907275, "frac_reward_zero_std": 1.0, "grad_norm": 0.06066447916340524, "kl": 0.30517578125, "learning_rate": 1.2141423781748913e-05, "loss": 0.0031, "num_tokens": 466622905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4890693011199687, "frac_reward_zero_std": 1.0, "grad_norm": 0.07175783619000203, "kl": 0.41455078125, "learning_rate": 1.2128079547278158e-05, "loss": 0.0041, "num_tokens": 467176201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4894605565608647, "frac_reward_zero_std": 1.0, "grad_norm": 0.253817074382614, "kl": 0.51220703125, "learning_rate": 1.2114731342429331e-05, "loss": 0.0051, "num_tokens": 467731929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.48985181200176064, "frac_reward_zero_std": 1.0, "grad_norm": 0.2968706036759733, "kl": 0.4267578125, "learning_rate": 1.2101379192106302e-05, "loss": 0.0043, "num_tokens": 468286169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4902430674426566, "frac_reward_zero_std": 1.0, "grad_norm": 0.10101029030230778, "kl": 0.27392578125, "learning_rate": 1.2088023121220308e-05, "loss": 0.0027, "num_tokens": 468839817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4906343228835526, "frac_reward_zero_std": 1.0, "grad_norm": 0.9297971441389524, "kl": 0.63671875, "learning_rate": 1.2074663154689892e-05, "loss": 0.0064, "num_tokens": 469393465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4910255783244486, "frac_reward_zero_std": 1.0, "grad_norm": 0.2673416986248585, "kl": 0.167724609375, "learning_rate": 1.206129931744087e-05, "loss": 0.0017, "num_tokens": 469948457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49141683376534456, "frac_reward_zero_std": 1.0, "grad_norm": 0.10018883471111822, "kl": 0.1517333984375, "learning_rate": 1.204793163440628e-05, "loss": 0.0015, "num_tokens": 470501897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49180808920624053, "frac_reward_zero_std": 1.0, "grad_norm": 0.08244357925916068, "kl": 0.15478515625, "learning_rate": 1.2034560130526341e-05, "loss": 0.0015, "num_tokens": 471055337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4921993446471365, "frac_reward_zero_std": 1.0, "grad_norm": 0.05343725115194317, "kl": 0.1448974609375, "learning_rate": 1.2021184830748382e-05, "loss": 0.0014, "num_tokens": 471609657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49259060008803246, "frac_reward_zero_std": 1.0, "grad_norm": 0.025454968340622303, "kl": 0.146240234375, "learning_rate": 1.2007805760026832e-05, "loss": 0.0015, "num_tokens": 472163721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4929818555289284, "frac_reward_zero_std": 1.0, "grad_norm": 0.017216196903236072, "kl": 0.14990234375, "learning_rate": 1.199442294332315e-05, "loss": 0.0015, "num_tokens": 472717817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49337311096982445, "frac_reward_zero_std": 1.0, "grad_norm": 0.025389469449973605, "kl": 0.149658203125, "learning_rate": 1.1981036405605782e-05, "loss": 0.0015, "num_tokens": 473272553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4937643664107204, "frac_reward_zero_std": 1.0, "grad_norm": 0.04356606944124717, "kl": 0.150146484375, "learning_rate": 1.1967646171850118e-05, "loss": 0.0015, "num_tokens": 473825097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4941556218516164, "frac_reward_zero_std": 1.0, "grad_norm": 0.029695833171561197, "kl": 0.1510009765625, "learning_rate": 1.195425226703844e-05, "loss": 0.0015, "num_tokens": 474381913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49454687729251234, "frac_reward_zero_std": 1.0, "grad_norm": 0.12545197736192856, "kl": 0.1346435546875, "learning_rate": 1.194085471615989e-05, "loss": 0.0013, "num_tokens": 474935977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4949381327334083, "frac_reward_zero_std": 1.0, "grad_norm": 0.03973362460491708, "kl": 0.13916015625, "learning_rate": 1.1927453544210397e-05, "loss": 0.0014, "num_tokens": 475488457.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4953293881743043, "frac_reward_zero_std": 1.0, "grad_norm": 0.014178303147299624, "kl": 0.12774658203125, "learning_rate": 1.1914048776192657e-05, "loss": 0.0013, "num_tokens": 476041081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4957206436152003, "frac_reward_zero_std": 1.0, "grad_norm": 0.023786367692983444, "kl": 0.114990234375, "learning_rate": 1.1900640437116074e-05, "loss": 0.0011, "num_tokens": 476594297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49611189905609626, "frac_reward_zero_std": 1.0, "grad_norm": 0.6653133853933678, "kl": 0.124267578125, "learning_rate": 1.188722855199671e-05, "loss": 0.0012, "num_tokens": 477150473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4965031544969922, "frac_reward_zero_std": 1.0, "grad_norm": 0.06059581736315311, "kl": 0.1314697265625, "learning_rate": 1.187381314585725e-05, "loss": 0.0013, "num_tokens": 477704857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4968944099378882, "frac_reward_zero_std": 1.0, "grad_norm": 0.0365452654761047, "kl": 0.170654296875, "learning_rate": 1.1860394243726933e-05, "loss": 0.0017, "num_tokens": 478258905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49728566537878416, "frac_reward_zero_std": 1.0, "grad_norm": 0.058598541126379355, "kl": 0.1588134765625, "learning_rate": 1.1846971870641544e-05, "loss": 0.0016, "num_tokens": 478813961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4976769208196801, "frac_reward_zero_std": 1.0, "grad_norm": 0.4014397213325045, "kl": 0.16748046875, "learning_rate": 1.1833546051643325e-05, "loss": 0.0017, "num_tokens": 479366793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49806817626057615, "frac_reward_zero_std": 1.0, "grad_norm": 1.3146173344215974, "kl": 0.2198486328125, "learning_rate": 1.182011681178095e-05, "loss": 0.0022, "num_tokens": 479920265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4984594317014721, "frac_reward_zero_std": 1.0, "grad_norm": 0.07400969914623746, "kl": 0.1290283203125, "learning_rate": 1.1806684176109486e-05, "loss": 0.0013, "num_tokens": 480474729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.4988506871423681, "frac_reward_zero_std": 1.0, "grad_norm": 0.04001819700218596, "kl": 0.10809326171875, "learning_rate": 1.1793248169690319e-05, "loss": 0.0011, "num_tokens": 481029865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49924194258326404, "frac_reward_zero_std": 1.0, "grad_norm": 0.6386587383520783, "kl": 0.2325439453125, "learning_rate": 1.1779808817591143e-05, "loss": 0.0023, "num_tokens": 481582425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.49963319802416, "frac_reward_zero_std": 1.0, "grad_norm": 0.04679333864397206, "kl": 0.07342529296875, "learning_rate": 1.1766366144885877e-05, "loss": 0.0007, "num_tokens": 482137673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.500024453465056, "frac_reward_zero_std": 1.0, "grad_norm": 0.011140753213420833, "kl": 0.0850830078125, "learning_rate": 1.175292017665464e-05, "loss": 0.0009, "num_tokens": 482691273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.500415708905952, "frac_reward_zero_std": 1.0, "grad_norm": 0.019350875735652303, "kl": 0.07525634765625, "learning_rate": 1.1739470937983708e-05, "loss": 0.0008, "num_tokens": 483246169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5008069643468479, "frac_reward_zero_std": 1.0, "grad_norm": 0.036255969819245226, "kl": 0.060272216796875, "learning_rate": 1.1726018453965452e-05, "loss": 0.0006, "num_tokens": 483800793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5011982197877439, "frac_reward_zero_std": 1.0, "grad_norm": 0.01126654571596688, "kl": 0.054290771484375, "learning_rate": 1.171256274969829e-05, "loss": 0.0005, "num_tokens": 484353993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.50158947522864, "frac_reward_zero_std": 1.0, "grad_norm": 0.025347632354996638, "kl": 0.0452880859375, "learning_rate": 1.1699103850286668e-05, "loss": 0.0005, "num_tokens": 484907513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5019807306695359, "frac_reward_zero_std": 1.0, "grad_norm": 0.029681997018109106, "kl": 0.04022216796875, "learning_rate": 1.1685641780840972e-05, "loss": 0.0004, "num_tokens": 485460105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5023719861104319, "frac_reward_zero_std": 1.0, "grad_norm": 0.00870802707618321, "kl": 0.037872314453125, "learning_rate": 1.167217656647752e-05, "loss": 0.0004, "num_tokens": 486015561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5027632415513278, "frac_reward_zero_std": 1.0, "grad_norm": 0.007383165975046298, "kl": 0.03497314453125, "learning_rate": 1.1658708232318484e-05, "loss": 0.0003, "num_tokens": 486569881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5031544969922238, "frac_reward_zero_std": 1.0, "grad_norm": 0.004120010163711365, "kl": 0.035308837890625, "learning_rate": 1.1645236803491859e-05, "loss": 0.0004, "num_tokens": 487123337.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5035457524331197, "frac_reward_zero_std": 1.0, "grad_norm": 0.003710105269728874, "kl": 0.032318115234375, "learning_rate": 1.1631762305131424e-05, "loss": 0.0003, "num_tokens": 487678521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5039370078740157, "frac_reward_zero_std": 1.0, "grad_norm": 0.003804763313088028, "kl": 0.03363037109375, "learning_rate": 1.1618284762376674e-05, "loss": 0.0003, "num_tokens": 488233673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5043282633149118, "frac_reward_zero_std": 1.0, "grad_norm": 0.00453851588672677, "kl": 0.032257080078125, "learning_rate": 1.1604804200372786e-05, "loss": 0.0003, "num_tokens": 488788201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5047195187558077, "frac_reward_zero_std": 1.0, "grad_norm": 0.006267344575560191, "kl": 0.0319976806640625, "learning_rate": 1.1591320644270575e-05, "loss": 0.0003, "num_tokens": 489344425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5051107741967037, "frac_reward_zero_std": 1.0, "grad_norm": 0.01293765861617522, "kl": 0.03253173828125, "learning_rate": 1.1577834119226439e-05, "loss": 0.0003, "num_tokens": 489897497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5055020296375996, "frac_reward_zero_std": 1.0, "grad_norm": 0.002395866912914017, "kl": 0.030364990234375, "learning_rate": 1.156434465040231e-05, "loss": 0.0003, "num_tokens": 490449241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5058932850784956, "frac_reward_zero_std": 1.0, "grad_norm": 0.00858746064185837, "kl": 0.0325775146484375, "learning_rate": 1.1550852262965622e-05, "loss": 0.0003, "num_tokens": 491002441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5062845405193916, "frac_reward_zero_std": 1.0, "grad_norm": 0.004726531241097891, "kl": 0.029449462890625, "learning_rate": 1.1537356982089247e-05, "loss": 0.0003, "num_tokens": 491555865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5066757959602876, "frac_reward_zero_std": 1.0, "grad_norm": 0.009225714156959362, "kl": 0.0300445556640625, "learning_rate": 1.152385883295146e-05, "loss": 0.0003, "num_tokens": 492107257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5070670514011836, "frac_reward_zero_std": 1.0, "grad_norm": 0.020075861233202794, "kl": 0.030731201171875, "learning_rate": 1.1510357840735884e-05, "loss": 0.0003, "num_tokens": 492660665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5074583068420795, "frac_reward_zero_std": 1.0, "grad_norm": 0.007569198479228668, "kl": 0.03363037109375, "learning_rate": 1.1496854030631443e-05, "loss": 0.0003, "num_tokens": 493213673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5078495622829755, "frac_reward_zero_std": 1.0, "grad_norm": 0.006124629332560575, "kl": 0.030548095703125, "learning_rate": 1.148334742783233e-05, "loss": 0.0003, "num_tokens": 493767545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5082408177238714, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015199408838556777, "kl": 0.0301513671875, "learning_rate": 1.146983805753794e-05, "loss": 0.0003, "num_tokens": 494321961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5086320731647674, "frac_reward_zero_std": 1.0, "grad_norm": 0.006042606329064069, "kl": 0.0300445556640625, "learning_rate": 1.1456325944952827e-05, "loss": 0.0003, "num_tokens": 494876377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5090233286056635, "frac_reward_zero_std": 1.0, "grad_norm": 0.003717666114950879, "kl": 0.031280517578125, "learning_rate": 1.144281111528667e-05, "loss": 0.0003, "num_tokens": 495430777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5094145840465594, "frac_reward_zero_std": 1.0, "grad_norm": 0.002458481177825074, "kl": 0.0297393798828125, "learning_rate": 1.1429293593754216e-05, "loss": 0.0003, "num_tokens": 495985785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5098058394874554, "frac_reward_zero_std": 1.0, "grad_norm": 0.0017594150395619469, "kl": 0.029144287109375, "learning_rate": 1.1415773405575233e-05, "loss": 0.0003, "num_tokens": 496540089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5101970949283513, "frac_reward_zero_std": 1.0, "grad_norm": 0.04495904615624312, "kl": 0.03802490234375, "learning_rate": 1.1402250575974456e-05, "loss": 0.0004, "num_tokens": 497094201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5105883503692473, "frac_reward_zero_std": 1.0, "grad_norm": 0.00620283067661602, "kl": 0.03289794921875, "learning_rate": 1.1388725130181566e-05, "loss": 0.0003, "num_tokens": 497649097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5109796058101433, "frac_reward_zero_std": 1.0, "grad_norm": 0.0015544143559623844, "kl": 0.0325927734375, "learning_rate": 1.1375197093431108e-05, "loss": 0.0003, "num_tokens": 498201561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5113708612510393, "frac_reward_zero_std": 1.0, "grad_norm": 0.011791694643137873, "kl": 0.039093017578125, "learning_rate": 1.1361666490962468e-05, "loss": 0.0004, "num_tokens": 498754697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5117621166919353, "frac_reward_zero_std": 1.0, "grad_norm": 0.00433620463537868, "kl": 0.03741455078125, "learning_rate": 1.1348133348019822e-05, "loss": 0.0004, "num_tokens": 499308697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5121533721328312, "frac_reward_zero_std": 1.0, "grad_norm": 0.01412578291592083, "kl": 0.04302978515625, "learning_rate": 1.1334597689852076e-05, "loss": 0.0004, "num_tokens": 499862969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5125446275737272, "frac_reward_zero_std": 1.0, "grad_norm": 0.010623145971307913, "kl": 0.03936767578125, "learning_rate": 1.1321059541712844e-05, "loss": 0.0004, "num_tokens": 500416569.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5129358830146231, "frac_reward_zero_std": 1.0, "grad_norm": 0.00793938487696846, "kl": 0.04486083984375, "learning_rate": 1.1307518928860371e-05, "loss": 0.0004, "num_tokens": 500969689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5133271384555191, "frac_reward_zero_std": 1.0, "grad_norm": 0.005230525889971785, "kl": 0.043426513671875, "learning_rate": 1.1293975876557506e-05, "loss": 0.0004, "num_tokens": 501525241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5137183938964152, "frac_reward_zero_std": 1.0, "grad_norm": 0.020155089084848775, "kl": 0.050048828125, "learning_rate": 1.1280430410071652e-05, "loss": 0.0005, "num_tokens": 502078521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5141096493373111, "frac_reward_zero_std": 1.0, "grad_norm": 0.003586576749905112, "kl": 0.04742431640625, "learning_rate": 1.1266882554674709e-05, "loss": 0.0005, "num_tokens": 502631017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5145009047782071, "frac_reward_zero_std": 1.0, "grad_norm": 0.08447255888756033, "kl": 0.053070068359375, "learning_rate": 1.1253332335643043e-05, "loss": 0.0005, "num_tokens": 503183593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.514892160219103, "frac_reward_zero_std": 1.0, "grad_norm": 0.00916749108731875, "kl": 0.05517578125, "learning_rate": 1.1239779778257424e-05, "loss": 0.0006, "num_tokens": 503737161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.515283415659999, "frac_reward_zero_std": 1.0, "grad_norm": 0.09888826488804488, "kl": 0.06671142578125, "learning_rate": 1.1226224907802986e-05, "loss": 0.0007, "num_tokens": 504289193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.515674671100895, "frac_reward_zero_std": 1.0, "grad_norm": 2.6450566626476326, "kl": 0.115478515625, "learning_rate": 1.1212667749569179e-05, "loss": 0.0012, "num_tokens": 504843593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.516065926541791, "frac_reward_zero_std": 1.0, "grad_norm": 0.014062114722087316, "kl": 0.060821533203125, "learning_rate": 1.119910832884972e-05, "loss": 0.0006, "num_tokens": 505397945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.516457181982687, "frac_reward_zero_std": 1.0, "grad_norm": 0.004597621191960287, "kl": 0.0665283203125, "learning_rate": 1.1185546670942547e-05, "loss": 0.0007, "num_tokens": 505951993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5168484374235829, "frac_reward_zero_std": 1.0, "grad_norm": 0.0056181372166502624, "kl": 0.07623291015625, "learning_rate": 1.1171982801149774e-05, "loss": 0.0008, "num_tokens": 506504953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5172396928644789, "frac_reward_zero_std": 1.0, "grad_norm": 0.03895912448493468, "kl": 0.0902099609375, "learning_rate": 1.1158416744777644e-05, "loss": 0.0009, "num_tokens": 507058025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5176309483053748, "frac_reward_zero_std": 1.0, "grad_norm": 0.11336620076816525, "kl": 0.08551025390625, "learning_rate": 1.1144848527136472e-05, "loss": 0.0009, "num_tokens": 507610969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5180222037462708, "frac_reward_zero_std": 1.0, "grad_norm": 0.03189882271301583, "kl": 0.08807373046875, "learning_rate": 1.113127817354061e-05, "loss": 0.0009, "num_tokens": 508163737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5184134591871669, "frac_reward_zero_std": 1.0, "grad_norm": 0.007546039663686281, "kl": 0.07623291015625, "learning_rate": 1.1117705709308394e-05, "loss": 0.0008, "num_tokens": 508720345.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5188047146280628, "frac_reward_zero_std": 1.0, "grad_norm": 26.468644734294895, "kl": 0.24346923828125, "learning_rate": 1.1104131159762104e-05, "loss": 0.0024, "num_tokens": 509275081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5191959700689588, "frac_reward_zero_std": 1.0, "grad_norm": 0.012382759703187588, "kl": 0.09063720703125, "learning_rate": 1.1090554550227899e-05, "loss": 0.0009, "num_tokens": 509828633.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5195872255098547, "frac_reward_zero_std": 1.0, "grad_norm": 0.011253951469088457, "kl": 0.0838623046875, "learning_rate": 1.107697590603579e-05, "loss": 0.0008, "num_tokens": 510382281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5199784809507507, "frac_reward_zero_std": 1.0, "grad_norm": 0.013926501356236876, "kl": 0.09710693359375, "learning_rate": 1.106339525251958e-05, "loss": 0.001, "num_tokens": 510937449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5203697363916467, "frac_reward_zero_std": 1.0, "grad_norm": 0.01092772392912622, "kl": 0.08441162109375, "learning_rate": 1.1049812615016823e-05, "loss": 0.0008, "num_tokens": 511491257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5207609918325427, "frac_reward_zero_std": 1.0, "grad_norm": 0.012418332952055777, "kl": 0.09442138671875, "learning_rate": 1.1036228018868775e-05, "loss": 0.0009, "num_tokens": 512044777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5211522472734387, "frac_reward_zero_std": 1.0, "grad_norm": 0.009270857681101221, "kl": 0.08135986328125, "learning_rate": 1.1022641489420342e-05, "loss": 0.0008, "num_tokens": 512600057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5215435027143346, "frac_reward_zero_std": 1.0, "grad_norm": 0.02643950783129231, "kl": 0.08428955078125, "learning_rate": 1.1009053052020046e-05, "loss": 0.0008, "num_tokens": 513155769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5219347581552306, "frac_reward_zero_std": 1.0, "grad_norm": 0.11989705795903795, "kl": 0.07763671875, "learning_rate": 1.0995462732019957e-05, "loss": 0.0008, "num_tokens": 513711417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5223260135961266, "frac_reward_zero_std": 1.0, "grad_norm": 0.007058068125754561, "kl": 0.06842041015625, "learning_rate": 1.0981870554775664e-05, "loss": 0.0007, "num_tokens": 514266441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5227172690370225, "frac_reward_zero_std": 1.0, "grad_norm": 0.004518615898128577, "kl": 0.065032958984375, "learning_rate": 1.0968276545646223e-05, "loss": 0.0007, "num_tokens": 514821017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5231085244779186, "frac_reward_zero_std": 1.0, "grad_norm": 0.003898350359503188, "kl": 0.063018798828125, "learning_rate": 1.0954680729994103e-05, "loss": 0.0006, "num_tokens": 515375657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5234997799188145, "frac_reward_zero_std": 1.0, "grad_norm": 0.09644190380055712, "kl": 0.064666748046875, "learning_rate": 1.0941083133185146e-05, "loss": 0.0006, "num_tokens": 515929977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5238910353597105, "frac_reward_zero_std": 1.0, "grad_norm": 0.003391833069671105, "kl": 0.05743408203125, "learning_rate": 1.0927483780588515e-05, "loss": 0.0006, "num_tokens": 516483481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5242822908006064, "frac_reward_zero_std": 1.0, "grad_norm": 0.03059512495128652, "kl": 0.05999755859375, "learning_rate": 1.091388269757665e-05, "loss": 0.0006, "num_tokens": 517037849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5246735462415024, "frac_reward_zero_std": 1.0, "grad_norm": 0.01945494544054731, "kl": 0.05908203125, "learning_rate": 1.0900279909525226e-05, "loss": 0.0006, "num_tokens": 517591145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5250648016823984, "frac_reward_zero_std": 1.0, "grad_norm": 0.012030208027465907, "kl": 0.055755615234375, "learning_rate": 1.0886675441813083e-05, "loss": 0.0006, "num_tokens": 518144489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5254560571232944, "frac_reward_zero_std": 1.0, "grad_norm": 0.0052458436723941005, "kl": 0.060791015625, "learning_rate": 1.0873069319822212e-05, "loss": 0.0006, "num_tokens": 518698521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5258473125641904, "frac_reward_zero_std": 1.0, "grad_norm": 0.0027884909820102967, "kl": 0.060791015625, "learning_rate": 1.0859461568937682e-05, "loss": 0.0006, "num_tokens": 519251657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5262385680050863, "frac_reward_zero_std": 1.0, "grad_norm": 0.10282976338870048, "kl": 0.064483642578125, "learning_rate": 1.08458522145476e-05, "loss": 0.0006, "num_tokens": 519805353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5266298234459823, "frac_reward_zero_std": 1.0, "grad_norm": 0.006496380248196584, "kl": 0.0684814453125, "learning_rate": 1.0832241282043067e-05, "loss": 0.0007, "num_tokens": 520358137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5270210788868783, "frac_reward_zero_std": 1.0, "grad_norm": 0.7289406685667376, "kl": 0.34698486328125, "learning_rate": 1.0818628796818134e-05, "loss": 0.0035, "num_tokens": 520910841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5274123343277742, "frac_reward_zero_std": 1.0, "grad_norm": 0.6657775771779028, "kl": 0.2734375, "learning_rate": 1.0805014784269734e-05, "loss": 0.0027, "num_tokens": 521463641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5278035897686703, "frac_reward_zero_std": 1.0, "grad_norm": 0.9229720423367794, "kl": 0.2763671875, "learning_rate": 1.079139926979766e-05, "loss": 0.0028, "num_tokens": 522017433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5281948452095662, "frac_reward_zero_std": 1.0, "grad_norm": 10.252635151641952, "kl": 0.590087890625, "learning_rate": 1.0777782278804511e-05, "loss": 0.0059, "num_tokens": 522571017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5285861006504622, "frac_reward_zero_std": 1.0, "grad_norm": 12754.875514502834, "kl": 116.677734375, "learning_rate": 1.0764163836695632e-05, "loss": 1.1724, "num_tokens": 523125353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5289773560913581, "frac_reward_zero_std": 1.0, "grad_norm": 154.93083969735773, "kl": 2.787109375, "learning_rate": 1.0750543968879081e-05, "loss": 0.0279, "num_tokens": 523678585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5293686115322541, "frac_reward_zero_std": 1.0, "grad_norm": 0.12525192058925966, "kl": 0.6181640625, "learning_rate": 1.073692270076557e-05, "loss": 0.0062, "num_tokens": 524231881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5297598669731501, "frac_reward_zero_std": 1.0, "grad_norm": 0.3264233482431127, "kl": 0.76220703125, "learning_rate": 1.072330005776843e-05, "loss": 0.0076, "num_tokens": 524786009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.530151122414046, "frac_reward_zero_std": 1.0, "grad_norm": 0.5013932935385996, "kl": 0.87841796875, "learning_rate": 1.0709676065303556e-05, "loss": 0.0088, "num_tokens": 525341113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5305423778549421, "frac_reward_zero_std": 1.0, "grad_norm": 0.48923138422463625, "kl": 0.87353515625, "learning_rate": 1.0696050748789357e-05, "loss": 0.0087, "num_tokens": 525896553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.530933633295838, "frac_reward_zero_std": 1.0, "grad_norm": 11.34203671162093, "kl": 1.1337890625, "learning_rate": 1.0682424133646712e-05, "loss": 0.0113, "num_tokens": 526448041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.531324888736734, "frac_reward_zero_std": 1.0, "grad_norm": 0.7574698339129471, "kl": 0.9091796875, "learning_rate": 1.066879624529893e-05, "loss": 0.0091, "num_tokens": 527002137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.53171614417763, "frac_reward_zero_std": 1.0, "grad_norm": 3.899640881779135, "kl": 0.99951171875, "learning_rate": 1.0655167109171685e-05, "loss": 0.01, "num_tokens": 527558297.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5321073996185259, "frac_reward_zero_std": 1.0, "grad_norm": 19114.603983447327, "kl": 98.384765625, "learning_rate": 1.0641536750692993e-05, "loss": 0.9866, "num_tokens": 528117513.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.532498655059422, "frac_reward_zero_std": 1.0, "grad_norm": 2028.1218716118892, "kl": 17.361328125, "learning_rate": 1.0627905195293135e-05, "loss": 0.1737, "num_tokens": 528673241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5328899105003179, "frac_reward_zero_std": 1.0, "grad_norm": 230.21627405730356, "kl": 4.65234375, "learning_rate": 1.0614272468404637e-05, "loss": 0.0466, "num_tokens": 529228057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5332811659412139, "frac_reward_zero_std": 1.0, "grad_norm": 26.9687481762613, "kl": 1.896484375, "learning_rate": 1.0600638595462202e-05, "loss": 0.019, "num_tokens": 529785033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5336724213821098, "frac_reward_zero_std": 1.0, "grad_norm": 98.19836728160763, "kl": 1.6015625, "learning_rate": 1.0587003601902682e-05, "loss": 0.016, "num_tokens": 530338793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5340636768230058, "frac_reward_zero_std": 1.0, "grad_norm": 61.592411239227914, "kl": 1.5732421875, "learning_rate": 1.0573367513165003e-05, "loss": 0.0157, "num_tokens": 530895465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5344549322639018, "frac_reward_zero_std": 1.0, "grad_norm": 2.342385916264425, "kl": 1.2890625, "learning_rate": 1.055973035469015e-05, "loss": 0.0129, "num_tokens": 531447529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5348461877047977, "frac_reward_zero_std": 1.0, "grad_norm": 66.05230711540815, "kl": 1.923828125, "learning_rate": 1.05460921519211e-05, "loss": 0.0192, "num_tokens": 532001721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5352374431456938, "frac_reward_zero_std": 1.0, "grad_norm": 0.4834180358540301, "kl": 1.111328125, "learning_rate": 1.0532452930302778e-05, "loss": 0.0111, "num_tokens": 532553849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5356286985865897, "frac_reward_zero_std": 1.0, "grad_norm": 0.3797075486605677, "kl": 0.97705078125, "learning_rate": 1.0518812715282001e-05, "loss": 0.0098, "num_tokens": 533106729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5360199540274857, "frac_reward_zero_std": 1.0, "grad_norm": 0.3227293130718177, "kl": 0.8896484375, "learning_rate": 1.0505171532307447e-05, "loss": 0.0089, "num_tokens": 533659193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5364112094683817, "frac_reward_zero_std": 1.0, "grad_norm": 0.2692557947791634, "kl": 0.83056640625, "learning_rate": 1.0491529406829608e-05, "loss": 0.0083, "num_tokens": 534211865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5368024649092776, "frac_reward_zero_std": 1.0, "grad_norm": 0.214249976193059, "kl": 0.810546875, "learning_rate": 1.0477886364300722e-05, "loss": 0.0081, "num_tokens": 534764969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5371937203501737, "frac_reward_zero_std": 1.0, "grad_norm": 0.23298869312356454, "kl": 0.71337890625, "learning_rate": 1.0464242430174737e-05, "loss": 0.0071, "num_tokens": 535315385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5375849757910696, "frac_reward_zero_std": 1.0, "grad_norm": 0.2180374640879868, "kl": 0.6533203125, "learning_rate": 1.0450597629907276e-05, "loss": 0.0065, "num_tokens": 535868041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5379762312319656, "frac_reward_zero_std": 1.0, "grad_norm": 0.3023417283387628, "kl": 0.5673828125, "learning_rate": 1.043695198895557e-05, "loss": 0.0057, "num_tokens": 536421849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5383674866728615, "frac_reward_zero_std": 1.0, "grad_norm": 0.22033738852078097, "kl": 0.53662109375, "learning_rate": 1.0423305532778419e-05, "loss": 0.0054, "num_tokens": 536974121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5387587421137575, "frac_reward_zero_std": 1.0, "grad_norm": 254.43754897640036, "kl": 2.88134765625, "learning_rate": 1.0409658286836144e-05, "loss": 0.0289, "num_tokens": 537529737.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5391499975546535, "frac_reward_zero_std": 1.0, "grad_norm": 1.17935823186478, "kl": 0.7626953125, "learning_rate": 1.0396010276590543e-05, "loss": 0.0076, "num_tokens": 538085033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5395412529955494, "frac_reward_zero_std": 1.0, "grad_norm": 3.552637190621947, "kl": 0.53759765625, "learning_rate": 1.0382361527504836e-05, "loss": 0.0054, "num_tokens": 538639977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5399325084364455, "frac_reward_zero_std": 1.0, "grad_norm": 1.490930837854134, "kl": 0.5341796875, "learning_rate": 1.036871206504362e-05, "loss": 0.0053, "num_tokens": 539192585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5403237638773414, "frac_reward_zero_std": 1.0, "grad_norm": 0.301483343607019, "kl": 0.58251953125, "learning_rate": 1.0355061914672831e-05, "loss": 0.0058, "num_tokens": 539744841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5407150193182374, "frac_reward_zero_std": 1.0, "grad_norm": 0.4724877782182411, "kl": 0.7080078125, "learning_rate": 1.034141110185968e-05, "loss": 0.0071, "num_tokens": 540297097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5411062747591334, "frac_reward_zero_std": 1.0, "grad_norm": 0.3411437230300974, "kl": 0.62939453125, "learning_rate": 1.0327759652072618e-05, "loss": 0.0063, "num_tokens": 540851465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5414975302000293, "frac_reward_zero_std": 1.0, "grad_norm": 0.1772538803910055, "kl": 0.511962890625, "learning_rate": 1.0314107590781284e-05, "loss": 0.0051, "num_tokens": 541407209.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5418887856409254, "frac_reward_zero_std": 1.0, "grad_norm": 0.24725920023864267, "kl": 0.470458984375, "learning_rate": 1.0300454943456457e-05, "loss": 0.0047, "num_tokens": 541960073.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5422800410818213, "frac_reward_zero_std": 1.0, "grad_norm": 0.38461693469409924, "kl": 0.467529296875, "learning_rate": 1.0286801735570009e-05, "loss": 0.0047, "num_tokens": 542513161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5426712965227173, "frac_reward_zero_std": 1.0, "grad_norm": 0.15238480011610625, "kl": 0.467529296875, "learning_rate": 1.0273147992594861e-05, "loss": 0.0047, "num_tokens": 543069865.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5430625519636132, "frac_reward_zero_std": 1.0, "grad_norm": 0.27349907223861175, "kl": 0.473876953125, "learning_rate": 1.0259493740004927e-05, "loss": 0.0047, "num_tokens": 543622329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5434538074045092, "frac_reward_zero_std": 1.0, "grad_norm": 0.2535425139530763, "kl": 0.443115234375, "learning_rate": 1.0245839003275076e-05, "loss": 0.0044, "num_tokens": 544174617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5438450628454052, "frac_reward_zero_std": 1.0, "grad_norm": 0.13594539208019726, "kl": 0.381591796875, "learning_rate": 1.023218380788108e-05, "loss": 0.0038, "num_tokens": 544730361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5442363182863011, "frac_reward_zero_std": 1.0, "grad_norm": 0.19694991838515208, "kl": 0.35400390625, "learning_rate": 1.0218528179299562e-05, "loss": 0.0035, "num_tokens": 545284009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5446275737271972, "frac_reward_zero_std": 1.0, "grad_norm": 0.2178571397509956, "kl": 0.31005859375, "learning_rate": 1.0204872143007965e-05, "loss": 0.0031, "num_tokens": 545839545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5450188291680931, "frac_reward_zero_std": 1.0, "grad_norm": 0.08402669158068525, "kl": 0.26416015625, "learning_rate": 1.0191215724484476e-05, "loss": 0.0026, "num_tokens": 546393945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5454100846089891, "frac_reward_zero_std": 1.0, "grad_norm": 0.10877111578721503, "kl": 0.2362060546875, "learning_rate": 1.0177558949208008e-05, "loss": 0.0024, "num_tokens": 546946793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5458013400498851, "frac_reward_zero_std": 1.0, "grad_norm": 0.11452533583588441, "kl": 0.2105712890625, "learning_rate": 1.0163901842658134e-05, "loss": 0.0021, "num_tokens": 547499769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.546192595490781, "frac_reward_zero_std": 1.0, "grad_norm": 0.07831031852037874, "kl": 0.1822509765625, "learning_rate": 1.0150244430315048e-05, "loss": 0.0018, "num_tokens": 548054729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.546583850931677, "frac_reward_zero_std": 1.0, "grad_norm": 0.11448969936878485, "kl": 0.17333984375, "learning_rate": 1.013658673765951e-05, "loss": 0.0017, "num_tokens": 548610617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.546975106372573, "frac_reward_zero_std": 1.0, "grad_norm": 0.058040189690023385, "kl": 0.15673828125, "learning_rate": 1.0122928790172814e-05, "loss": 0.0016, "num_tokens": 549163753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.547366361813469, "frac_reward_zero_std": 1.0, "grad_norm": 0.11717371714223183, "kl": 0.1614990234375, "learning_rate": 1.010927061333671e-05, "loss": 0.0016, "num_tokens": 549717561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5477576172543649, "frac_reward_zero_std": 1.0, "grad_norm": 0.1070229571771743, "kl": 0.1544189453125, "learning_rate": 1.0095612232633394e-05, "loss": 0.0015, "num_tokens": 550272841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5481488726952609, "frac_reward_zero_std": 1.0, "grad_norm": 0.11426710382638015, "kl": 0.15673828125, "learning_rate": 1.0081953673545432e-05, "loss": 0.0016, "num_tokens": 550828585.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5485401281361569, "frac_reward_zero_std": 1.0, "grad_norm": 0.08020842108394216, "kl": 0.1485595703125, "learning_rate": 1.0068294961555734e-05, "loss": 0.0015, "num_tokens": 551383065.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5489313835770528, "frac_reward_zero_std": 1.0, "grad_norm": 0.06310561462909509, "kl": 0.1429443359375, "learning_rate": 1.0054636122147481e-05, "loss": 0.0014, "num_tokens": 551937433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5493226390179489, "frac_reward_zero_std": 1.0, "grad_norm": 0.056073758020919094, "kl": 0.1407470703125, "learning_rate": 1.0040977180804097e-05, "loss": 0.0014, "num_tokens": 552491705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5497138944588448, "frac_reward_zero_std": 1.0, "grad_norm": 0.3585161291053175, "kl": 0.186279296875, "learning_rate": 1.0027318163009201e-05, "loss": 0.0019, "num_tokens": 553046057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5501051498997408, "frac_reward_zero_std": 1.0, "grad_norm": 0.05009219005181626, "kl": 0.1490478515625, "learning_rate": 1.0013659094246552e-05, "loss": 0.0015, "num_tokens": 553599433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5504964053406368, "frac_reward_zero_std": 1.0, "grad_norm": 0.05304129591781501, "kl": 0.15234375, "learning_rate": 1e-05, "loss": 0.0015, "num_tokens": 554154793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5508876607815327, "frac_reward_zero_std": 1.0, "grad_norm": 0.12048258533172668, "kl": 0.1676025390625, "learning_rate": 9.98634090575345e-06, "loss": 0.0017, "num_tokens": 554708617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5512789162224287, "frac_reward_zero_std": 1.0, "grad_norm": 0.05804925487363815, "kl": 0.1634521484375, "learning_rate": 9.9726818369908e-06, "loss": 0.0016, "num_tokens": 555263401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5516701716633247, "frac_reward_zero_std": 1.0, "grad_norm": 0.13071039991326672, "kl": 0.1661376953125, "learning_rate": 9.95902281919591e-06, "loss": 0.0017, "num_tokens": 555816905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5520614271042207, "frac_reward_zero_std": 1.0, "grad_norm": 0.07217882771337868, "kl": 0.182373046875, "learning_rate": 9.945363877852526e-06, "loss": 0.0018, "num_tokens": 556372169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5524526825451166, "frac_reward_zero_std": 1.0, "grad_norm": 0.23852138039012094, "kl": 0.1988525390625, "learning_rate": 9.93170503844427e-06, "loss": 0.002, "num_tokens": 556927449.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5528439379860126, "frac_reward_zero_std": 1.0, "grad_norm": 0.17341316884526023, "kl": 0.2001953125, "learning_rate": 9.91804632645457e-06, "loss": 0.002, "num_tokens": 557481113.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5532351934269086, "frac_reward_zero_std": 1.0, "grad_norm": 0.09745766953261306, "kl": 0.2049560546875, "learning_rate": 9.904387767366607e-06, "loss": 0.0021, "num_tokens": 558036537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5536264488678045, "frac_reward_zero_std": 1.0, "grad_norm": 0.17776870547319018, "kl": 0.2252197265625, "learning_rate": 9.890729386663291e-06, "loss": 0.0022, "num_tokens": 558590201.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5540177043087006, "frac_reward_zero_std": 1.0, "grad_norm": 0.1650567806212086, "kl": 0.2310791015625, "learning_rate": 9.877071209827191e-06, "loss": 0.0023, "num_tokens": 559146377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5544089597495965, "frac_reward_zero_std": 1.0, "grad_norm": 0.12458408554469602, "kl": 0.24560546875, "learning_rate": 9.863413262340491e-06, "loss": 0.0025, "num_tokens": 559699529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5548002151904925, "frac_reward_zero_std": 1.0, "grad_norm": 0.07716665838920143, "kl": 0.2518310546875, "learning_rate": 9.849755569684955e-06, "loss": 0.0025, "num_tokens": 560253097.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5551914706313885, "frac_reward_zero_std": 1.0, "grad_norm": 0.09685681380664075, "kl": 0.27392578125, "learning_rate": 9.836098157341867e-06, "loss": 0.0027, "num_tokens": 560809929.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5555827260722844, "frac_reward_zero_std": 1.0, "grad_norm": 0.12172916447891624, "kl": 0.283935546875, "learning_rate": 9.822441050791995e-06, "loss": 0.0028, "num_tokens": 561365033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5559739815131804, "frac_reward_zero_std": 1.0, "grad_norm": 0.10324710110468181, "kl": 0.295166015625, "learning_rate": 9.808784275515526e-06, "loss": 0.0029, "num_tokens": 561917241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5563652369540764, "frac_reward_zero_std": 1.0, "grad_norm": 0.1061348240866679, "kl": 0.302490234375, "learning_rate": 9.79512785699204e-06, "loss": 0.003, "num_tokens": 562469081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5567564923949724, "frac_reward_zero_std": 1.0, "grad_norm": 0.11637390188349031, "kl": 0.310546875, "learning_rate": 9.78147182070044e-06, "loss": 0.0031, "num_tokens": 563020873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5571477478358683, "frac_reward_zero_std": 1.0, "grad_norm": 0.1225167487401071, "kl": 0.322265625, "learning_rate": 9.767816192118923e-06, "loss": 0.0032, "num_tokens": 563574009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5575390032767643, "frac_reward_zero_std": 1.0, "grad_norm": 0.08576847062952504, "kl": 0.34326171875, "learning_rate": 9.754160996724927e-06, "loss": 0.0034, "num_tokens": 564128329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5579302587176603, "frac_reward_zero_std": 1.0, "grad_norm": 0.17011256224145008, "kl": 0.405029296875, "learning_rate": 9.740506259995075e-06, "loss": 0.0041, "num_tokens": 564682489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5583215141585562, "frac_reward_zero_std": 1.0, "grad_norm": 0.1621906705264459, "kl": 0.404052734375, "learning_rate": 9.726852007405144e-06, "loss": 0.004, "num_tokens": 565238905.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5587127695994523, "frac_reward_zero_std": 1.0, "grad_norm": 0.13952722725320527, "kl": 0.416015625, "learning_rate": 9.713198264429993e-06, "loss": 0.0042, "num_tokens": 565792265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5591040250403482, "frac_reward_zero_std": 1.0, "grad_norm": 0.12208882441678509, "kl": 0.42138671875, "learning_rate": 9.699545056543546e-06, "loss": 0.0042, "num_tokens": 566346857.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5594952804812442, "frac_reward_zero_std": 1.0, "grad_norm": 0.10152047588138996, "kl": 0.413818359375, "learning_rate": 9.685892409218718e-06, "loss": 0.0041, "num_tokens": 566899369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5598865359221402, "frac_reward_zero_std": 1.0, "grad_norm": 0.0781569524538884, "kl": 0.421875, "learning_rate": 9.672240347927382e-06, "loss": 0.0042, "num_tokens": 567451961.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5602777913630361, "frac_reward_zero_std": 1.0, "grad_norm": 0.08857648056693726, "kl": 0.427734375, "learning_rate": 9.658588898140322e-06, "loss": 0.0043, "num_tokens": 568006873.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5606690468039321, "frac_reward_zero_std": 1.0, "grad_norm": 0.11151474449762412, "kl": 0.433837890625, "learning_rate": 9.644938085327174e-06, "loss": 0.0043, "num_tokens": 568561593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5610603022448281, "frac_reward_zero_std": 1.0, "grad_norm": 0.130751137970137, "kl": 0.438720703125, "learning_rate": 9.631287934956383e-06, "loss": 0.0044, "num_tokens": 569115161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5614515576857241, "frac_reward_zero_std": 1.0, "grad_norm": 0.09121086893595468, "kl": 0.43408203125, "learning_rate": 9.617638472495168e-06, "loss": 0.0043, "num_tokens": 569667433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.56184281312662, "frac_reward_zero_std": 1.0, "grad_norm": 0.07918289298674214, "kl": 0.438232421875, "learning_rate": 9.60398972340946e-06, "loss": 0.0044, "num_tokens": 570220137.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.562234068567516, "frac_reward_zero_std": 1.0, "grad_norm": 0.12058554669767238, "kl": 0.44580078125, "learning_rate": 9.590341713163858e-06, "loss": 0.0045, "num_tokens": 570772361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.562625324008412, "frac_reward_zero_std": 1.0, "grad_norm": 0.09017234501869549, "kl": 0.435791015625, "learning_rate": 9.576694467221583e-06, "loss": 0.0044, "num_tokens": 571326777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5630165794493079, "frac_reward_zero_std": 1.0, "grad_norm": 0.10007359600853337, "kl": 0.427001953125, "learning_rate": 9.563048011044433e-06, "loss": 0.0043, "num_tokens": 571880009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.563407834890204, "frac_reward_zero_std": 1.0, "grad_norm": 0.09844132921940169, "kl": 0.434814453125, "learning_rate": 9.549402370092728e-06, "loss": 0.0043, "num_tokens": 572433161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5637990903310999, "frac_reward_zero_std": 1.0, "grad_norm": 0.17191124258743948, "kl": 0.437744140625, "learning_rate": 9.535757569825266e-06, "loss": 0.0044, "num_tokens": 572986665.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5641903457719959, "frac_reward_zero_std": 1.0, "grad_norm": 0.11408469403242644, "kl": 0.3994140625, "learning_rate": 9.522113635699281e-06, "loss": 0.004, "num_tokens": 573542329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5645816012128919, "frac_reward_zero_std": 1.0, "grad_norm": 0.3534442992289035, "kl": 0.3876953125, "learning_rate": 9.508470593170393e-06, "loss": 0.0039, "num_tokens": 574097321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5649728566537878, "frac_reward_zero_std": 1.0, "grad_norm": 0.12182395726532953, "kl": 0.375244140625, "learning_rate": 9.494828467692558e-06, "loss": 0.0038, "num_tokens": 574653401.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5653641120946838, "frac_reward_zero_std": 1.0, "grad_norm": 0.14520024387782715, "kl": 0.375732421875, "learning_rate": 9.481187284718005e-06, "loss": 0.0038, "num_tokens": 575206329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5657553675355798, "frac_reward_zero_std": 1.0, "grad_norm": 0.11064123339471826, "kl": 0.378662109375, "learning_rate": 9.467547069697227e-06, "loss": 0.0038, "num_tokens": 575760089.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5661466229764758, "frac_reward_zero_std": 1.0, "grad_norm": 0.0912774864856554, "kl": 0.37353515625, "learning_rate": 9.453907848078901e-06, "loss": 0.0037, "num_tokens": 576314121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5665378784173717, "frac_reward_zero_std": 1.0, "grad_norm": 0.11389275192238613, "kl": 0.375, "learning_rate": 9.44026964530985e-06, "loss": 0.0037, "num_tokens": 576867241.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5669291338582677, "frac_reward_zero_std": 1.0, "grad_norm": 0.09857179545796335, "kl": 0.364990234375, "learning_rate": 9.426632486834998e-06, "loss": 0.0036, "num_tokens": 577421033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5673203892991637, "frac_reward_zero_std": 1.0, "grad_norm": 0.11791353288648278, "kl": 0.3671875, "learning_rate": 9.412996398097325e-06, "loss": 0.0037, "num_tokens": 577975369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5677116447400596, "frac_reward_zero_std": 1.0, "grad_norm": 0.09336533340276951, "kl": 0.358642578125, "learning_rate": 9.3993614045378e-06, "loss": 0.0036, "num_tokens": 578529625.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5681029001809557, "frac_reward_zero_std": 1.0, "grad_norm": 0.10461290381361531, "kl": 0.362548828125, "learning_rate": 9.385727531595367e-06, "loss": 0.0036, "num_tokens": 579085657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5684941556218516, "frac_reward_zero_std": 1.0, "grad_norm": 0.11264160986753516, "kl": 0.362548828125, "learning_rate": 9.372094804706867e-06, "loss": 0.0036, "num_tokens": 579639129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5688854110627476, "frac_reward_zero_std": 1.0, "grad_norm": 0.07097819409876983, "kl": 0.354248046875, "learning_rate": 9.358463249307008e-06, "loss": 0.0035, "num_tokens": 580193369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5692766665036436, "frac_reward_zero_std": 1.0, "grad_norm": 0.06874265310067602, "kl": 0.36083984375, "learning_rate": 9.344832890828316e-06, "loss": 0.0036, "num_tokens": 580747033.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5696679219445395, "frac_reward_zero_std": 1.0, "grad_norm": 0.17971062335818666, "kl": 0.360595703125, "learning_rate": 9.331203754701075e-06, "loss": 0.0036, "num_tokens": 581301673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5700591773854355, "frac_reward_zero_std": 1.0, "grad_norm": 0.10062376353792703, "kl": 0.3603515625, "learning_rate": 9.317575866353293e-06, "loss": 0.0036, "num_tokens": 581854313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5704504328263315, "frac_reward_zero_std": 1.0, "grad_norm": 0.13510538511054168, "kl": 0.367431640625, "learning_rate": 9.303949251210646e-06, "loss": 0.0037, "num_tokens": 582408521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5708416882672275, "frac_reward_zero_std": 1.0, "grad_norm": 0.09514247957918665, "kl": 0.359375, "learning_rate": 9.290323934696447e-06, "loss": 0.0036, "num_tokens": 582962761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5712329437081234, "frac_reward_zero_std": 1.0, "grad_norm": 0.07106094766683621, "kl": 0.350830078125, "learning_rate": 9.27669994223157e-06, "loss": 0.0035, "num_tokens": 583517161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5716241991490194, "frac_reward_zero_std": 1.0, "grad_norm": 0.11448328519964801, "kl": 0.34326171875, "learning_rate": 9.263077299234433e-06, "loss": 0.0034, "num_tokens": 584071753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5720154545899154, "frac_reward_zero_std": 1.0, "grad_norm": 0.12975773322055414, "kl": 0.353759765625, "learning_rate": 9.249456031120922e-06, "loss": 0.0035, "num_tokens": 584625561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5724067100308113, "frac_reward_zero_std": 1.0, "grad_norm": 0.07626025629783996, "kl": 0.35693359375, "learning_rate": 9.23583616330437e-06, "loss": 0.0036, "num_tokens": 585180729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5727979654717074, "frac_reward_zero_std": 1.0, "grad_norm": 0.2661531630890652, "kl": 0.384765625, "learning_rate": 9.22221772119549e-06, "loss": 0.0038, "num_tokens": 585733945.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5731892209126033, "frac_reward_zero_std": 1.0, "grad_norm": 0.11871379834189526, "kl": 0.365966796875, "learning_rate": 9.20860073020234e-06, "loss": 0.0037, "num_tokens": 586286121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5735804763534993, "frac_reward_zero_std": 1.0, "grad_norm": 0.08767737037457211, "kl": 0.369384765625, "learning_rate": 9.19498521573027e-06, "loss": 0.0037, "num_tokens": 586838553.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5739717317943953, "frac_reward_zero_std": 1.0, "grad_norm": 0.16951393854432503, "kl": 0.37451171875, "learning_rate": 9.181371203181873e-06, "loss": 0.0037, "num_tokens": 587394009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5743629872352912, "frac_reward_zero_std": 1.0, "grad_norm": 0.1208451630434129, "kl": 0.375, "learning_rate": 9.167758717956934e-06, "loss": 0.0037, "num_tokens": 587947465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5747542426761872, "frac_reward_zero_std": 1.0, "grad_norm": 0.09746663551217356, "kl": 0.369384765625, "learning_rate": 9.154147785452401e-06, "loss": 0.0037, "num_tokens": 588499769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5751454981170832, "frac_reward_zero_std": 1.0, "grad_norm": 0.1054226722385752, "kl": 0.353759765625, "learning_rate": 9.14053843106232e-06, "loss": 0.0035, "num_tokens": 589053577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5755367535579792, "frac_reward_zero_std": 1.0, "grad_norm": 0.13504300710718115, "kl": 0.340576171875, "learning_rate": 9.126930680177788e-06, "loss": 0.0034, "num_tokens": 589607161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5759280089988752, "frac_reward_zero_std": 1.0, "grad_norm": 0.16515346086033966, "kl": 0.338623046875, "learning_rate": 9.113324558186922e-06, "loss": 0.0034, "num_tokens": 590161353.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5763192644397711, "frac_reward_zero_std": 1.0, "grad_norm": 0.11591530656365959, "kl": 0.3203125, "learning_rate": 9.099720090474779e-06, "loss": 0.0032, "num_tokens": 590714185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5767105198806671, "frac_reward_zero_std": 1.0, "grad_norm": 0.09802225386526339, "kl": 0.31298828125, "learning_rate": 9.086117302423353e-06, "loss": 0.0031, "num_tokens": 591265433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.577101775321563, "frac_reward_zero_std": 1.0, "grad_norm": 0.18741986939000518, "kl": 0.33447265625, "learning_rate": 9.07251621941149e-06, "loss": 0.0033, "num_tokens": 591819561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5774930307624591, "frac_reward_zero_std": 1.0, "grad_norm": 0.13323200745444472, "kl": 0.335205078125, "learning_rate": 9.058916866814857e-06, "loss": 0.0033, "num_tokens": 592375385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.577884286203355, "frac_reward_zero_std": 1.0, "grad_norm": 0.13808747538764035, "kl": 0.35546875, "learning_rate": 9.0453192700059e-06, "loss": 0.0036, "num_tokens": 592929769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.578275541644251, "frac_reward_zero_std": 1.0, "grad_norm": 0.11576872429412974, "kl": 0.373291015625, "learning_rate": 9.03172345435378e-06, "loss": 0.0037, "num_tokens": 593482921.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.578666797085147, "frac_reward_zero_std": 1.0, "grad_norm": 0.10466656808656208, "kl": 0.376220703125, "learning_rate": 9.01812944522434e-06, "loss": 0.0038, "num_tokens": 594037673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5790580525260429, "frac_reward_zero_std": 1.0, "grad_norm": 0.11199705868735342, "kl": 0.3681640625, "learning_rate": 9.004537267980046e-06, "loss": 0.0037, "num_tokens": 594591049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5794493079669389, "frac_reward_zero_std": 1.0, "grad_norm": 0.13834455457585476, "kl": 0.365234375, "learning_rate": 8.990946947979955e-06, "loss": 0.0037, "num_tokens": 595143369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5798405634078349, "frac_reward_zero_std": 1.0, "grad_norm": 0.13919900496136178, "kl": 0.359130859375, "learning_rate": 8.977358510579658e-06, "loss": 0.0036, "num_tokens": 595698169.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5802318188487309, "frac_reward_zero_std": 1.0, "grad_norm": 0.1370612710786989, "kl": 0.35986328125, "learning_rate": 8.963771981131227e-06, "loss": 0.0036, "num_tokens": 596251497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5806230742896269, "frac_reward_zero_std": 1.0, "grad_norm": 0.9368269564842708, "kl": 0.388427734375, "learning_rate": 8.95018738498318e-06, "loss": 0.0039, "num_tokens": 596805977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5810143297305228, "frac_reward_zero_std": 1.0, "grad_norm": 0.6471307841506277, "kl": 0.47314453125, "learning_rate": 8.936604747480422e-06, "loss": 0.0047, "num_tokens": 597359721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5814055851714188, "frac_reward_zero_std": 1.0, "grad_norm": 0.5214316886145748, "kl": 0.4736328125, "learning_rate": 8.923024093964214e-06, "loss": 0.0047, "num_tokens": 597912985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5817968406123147, "frac_reward_zero_std": 1.0, "grad_norm": 0.23992582706219953, "kl": 0.429443359375, "learning_rate": 8.909445449772103e-06, "loss": 0.0043, "num_tokens": 598472025.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5821880960532108, "frac_reward_zero_std": 1.0, "grad_norm": 0.1923591471989458, "kl": 0.400634765625, "learning_rate": 8.895868840237896e-06, "loss": 0.004, "num_tokens": 599025529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5825793514941067, "frac_reward_zero_std": 1.0, "grad_norm": 0.20516795420079958, "kl": 0.3623046875, "learning_rate": 8.882294290691609e-06, "loss": 0.0036, "num_tokens": 599578953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5829706069350027, "frac_reward_zero_std": 1.0, "grad_norm": 0.16097646701471877, "kl": 0.353271484375, "learning_rate": 8.868721826459396e-06, "loss": 0.0035, "num_tokens": 600135529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5833618623758987, "frac_reward_zero_std": 1.0, "grad_norm": 0.1216250437348178, "kl": 0.337158203125, "learning_rate": 8.855151472863533e-06, "loss": 0.0034, "num_tokens": 600690521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5837531178167946, "frac_reward_zero_std": 1.0, "grad_norm": 0.10884799734005007, "kl": 0.337158203125, "learning_rate": 8.841583255222359e-06, "loss": 0.0034, "num_tokens": 601243465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5841443732576906, "frac_reward_zero_std": 1.0, "grad_norm": 0.16071280698881624, "kl": 0.338623046875, "learning_rate": 8.828017198850228e-06, "loss": 0.0034, "num_tokens": 601797321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5845356286985866, "frac_reward_zero_std": 1.0, "grad_norm": 0.15541913880271796, "kl": 0.33984375, "learning_rate": 8.814453329057455e-06, "loss": 0.0034, "num_tokens": 602352793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5849268841394826, "frac_reward_zero_std": 1.0, "grad_norm": 0.12153137687623608, "kl": 0.332763671875, "learning_rate": 8.800891671150286e-06, "loss": 0.0033, "num_tokens": 602908009.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5853181395803786, "frac_reward_zero_std": 1.0, "grad_norm": 0.10614619229115864, "kl": 0.32470703125, "learning_rate": 8.787332250430824e-06, "loss": 0.0033, "num_tokens": 603461481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5857093950212745, "frac_reward_zero_std": 1.0, "grad_norm": 0.12440102291829712, "kl": 0.330322265625, "learning_rate": 8.773775092197018e-06, "loss": 0.0033, "num_tokens": 604016249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5861006504621705, "frac_reward_zero_std": 1.0, "grad_norm": 0.12698845093914884, "kl": 0.34619140625, "learning_rate": 8.760220221742578e-06, "loss": 0.0035, "num_tokens": 604570809.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5864919059030664, "frac_reward_zero_std": 1.0, "grad_norm": 0.12614816029495937, "kl": 0.362060546875, "learning_rate": 8.746667664356957e-06, "loss": 0.0036, "num_tokens": 605126185.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5868831613439625, "frac_reward_zero_std": 1.0, "grad_norm": 0.11267300923088844, "kl": 0.390625, "learning_rate": 8.733117445325293e-06, "loss": 0.0039, "num_tokens": 605681577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5872744167848584, "frac_reward_zero_std": 1.0, "grad_norm": 0.13141517903517486, "kl": 0.410888671875, "learning_rate": 8.719569589928353e-06, "loss": 0.0041, "num_tokens": 606236217.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5876656722257544, "frac_reward_zero_std": 1.0, "grad_norm": 0.13257813448348377, "kl": 0.404296875, "learning_rate": 8.706024123442497e-06, "loss": 0.004, "num_tokens": 606789641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5880569276666504, "frac_reward_zero_std": 1.0, "grad_norm": 0.12523425938984595, "kl": 0.40234375, "learning_rate": 8.69248107113963e-06, "loss": 0.004, "num_tokens": 607345689.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5884481831075463, "frac_reward_zero_std": 1.0, "grad_norm": 0.15899105452978546, "kl": 0.39990234375, "learning_rate": 8.67894045828716e-06, "loss": 0.004, "num_tokens": 607899609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5888394385484423, "frac_reward_zero_std": 1.0, "grad_norm": 0.13923877432563853, "kl": 0.389404296875, "learning_rate": 8.665402310147924e-06, "loss": 0.0039, "num_tokens": 608453481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5892306939893383, "frac_reward_zero_std": 1.0, "grad_norm": 0.1314346718273789, "kl": 0.370361328125, "learning_rate": 8.651866651980184e-06, "loss": 0.0037, "num_tokens": 609008617.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5896219494302343, "frac_reward_zero_std": 1.0, "grad_norm": 0.11732369247184407, "kl": 0.36474609375, "learning_rate": 8.638333509037537e-06, "loss": 0.0036, "num_tokens": 609562889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5900132048711303, "frac_reward_zero_std": 1.0, "grad_norm": 0.1052079086549568, "kl": 0.36767578125, "learning_rate": 8.624802906568895e-06, "loss": 0.0037, "num_tokens": 610115705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5904044603120262, "frac_reward_zero_std": 1.0, "grad_norm": 0.10150290715985874, "kl": 0.355224609375, "learning_rate": 8.611274869818437e-06, "loss": 0.0035, "num_tokens": 610671721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5907957157529222, "frac_reward_zero_std": 1.0, "grad_norm": 0.11808829856853717, "kl": 0.365234375, "learning_rate": 8.597749424025544e-06, "loss": 0.0037, "num_tokens": 611224105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5911869711938181, "frac_reward_zero_std": 1.0, "grad_norm": 0.12231644420329296, "kl": 0.368896484375, "learning_rate": 8.584226594424772e-06, "loss": 0.0037, "num_tokens": 611777785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5915782266347142, "frac_reward_zero_std": 1.0, "grad_norm": 0.0980282778822759, "kl": 0.369140625, "learning_rate": 8.570706406245787e-06, "loss": 0.0037, "num_tokens": 612333993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5919694820756101, "frac_reward_zero_std": 1.0, "grad_norm": 0.17560104599413542, "kl": 0.38232421875, "learning_rate": 8.557188884713334e-06, "loss": 0.0038, "num_tokens": 612891305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5923607375165061, "frac_reward_zero_std": 1.0, "grad_norm": 0.10376553849707588, "kl": 0.369873046875, "learning_rate": 8.543674055047177e-06, "loss": 0.0037, "num_tokens": 613443129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5927519929574021, "frac_reward_zero_std": 1.0, "grad_norm": 0.15548416259653383, "kl": 0.380859375, "learning_rate": 8.530161942462063e-06, "loss": 0.0038, "num_tokens": 613998425.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.593143248398298, "frac_reward_zero_std": 1.0, "grad_norm": 0.07896179121628158, "kl": 0.3623046875, "learning_rate": 8.516652572167672e-06, "loss": 0.0036, "num_tokens": 614553545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.593534503839194, "frac_reward_zero_std": 1.0, "grad_norm": 0.08988485566670947, "kl": 0.361083984375, "learning_rate": 8.503145969368562e-06, "loss": 0.0036, "num_tokens": 615107785.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.59392575928009, "frac_reward_zero_std": 1.0, "grad_norm": 0.10972427706125608, "kl": 0.359130859375, "learning_rate": 8.489642159264123e-06, "loss": 0.0036, "num_tokens": 615661273.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.594317014720986, "frac_reward_zero_std": 1.0, "grad_norm": 0.09475374480819498, "kl": 0.35205078125, "learning_rate": 8.476141167048543e-06, "loss": 0.0035, "num_tokens": 616217705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.594708270161882, "frac_reward_zero_std": 1.0, "grad_norm": 0.10444553277991254, "kl": 0.35986328125, "learning_rate": 8.462643017910756e-06, "loss": 0.0036, "num_tokens": 616769849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5950995256027779, "frac_reward_zero_std": 1.0, "grad_norm": 0.08618509672332193, "kl": 0.352783203125, "learning_rate": 8.44914773703438e-06, "loss": 0.0035, "num_tokens": 617324441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5954907810436739, "frac_reward_zero_std": 1.0, "grad_norm": 0.06835866581023685, "kl": 0.35205078125, "learning_rate": 8.43565534959769e-06, "loss": 0.0035, "num_tokens": 617876745.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5958820364845698, "frac_reward_zero_std": 1.0, "grad_norm": 0.06252634955982209, "kl": 0.348876953125, "learning_rate": 8.422165880773566e-06, "loss": 0.0035, "num_tokens": 618431305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5962732919254659, "frac_reward_zero_std": 1.0, "grad_norm": 0.29821174426478586, "kl": 0.374755859375, "learning_rate": 8.408679355729429e-06, "loss": 0.0037, "num_tokens": 618985897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5966645473663618, "frac_reward_zero_std": 1.0, "grad_norm": 0.11160181622877588, "kl": 0.359130859375, "learning_rate": 8.395195799627217e-06, "loss": 0.0036, "num_tokens": 619541129.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5970558028072578, "frac_reward_zero_std": 1.0, "grad_norm": 0.13323256105019166, "kl": 0.367431640625, "learning_rate": 8.381715237623329e-06, "loss": 0.0037, "num_tokens": 620097721.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5974470582481538, "frac_reward_zero_std": 1.0, "grad_norm": 0.12076276612066693, "kl": 0.371337890625, "learning_rate": 8.36823769486858e-06, "loss": 0.0037, "num_tokens": 620651849.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5978383136890497, "frac_reward_zero_std": 1.0, "grad_norm": 0.14205399422804427, "kl": 0.373779296875, "learning_rate": 8.354763196508143e-06, "loss": 0.0037, "num_tokens": 621205593.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5982295691299457, "frac_reward_zero_std": 1.0, "grad_norm": 0.09181305447067424, "kl": 0.371826171875, "learning_rate": 8.341291767681523e-06, "loss": 0.0037, "num_tokens": 621760105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5986208245708416, "frac_reward_zero_std": 1.0, "grad_norm": 0.08440268676549904, "kl": 0.368896484375, "learning_rate": 8.327823433522484e-06, "loss": 0.0037, "num_tokens": 622312761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5990120800117377, "frac_reward_zero_std": 1.0, "grad_norm": 0.09026796208343668, "kl": 0.364990234375, "learning_rate": 8.31435821915903e-06, "loss": 0.0036, "num_tokens": 622867417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5994033354526337, "frac_reward_zero_std": 1.0, "grad_norm": 0.09565174389442009, "kl": 0.3701171875, "learning_rate": 8.300896149713334e-06, "loss": 0.0037, "num_tokens": 623422041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.5997945908935296, "frac_reward_zero_std": 1.0, "grad_norm": 0.0935437486033816, "kl": 0.369384765625, "learning_rate": 8.287437250301708e-06, "loss": 0.0037, "num_tokens": 623975385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6001858463344256, "frac_reward_zero_std": 1.0, "grad_norm": 0.10636434853622316, "kl": 0.36328125, "learning_rate": 8.273981546034554e-06, "loss": 0.0036, "num_tokens": 624527257.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6005771017753215, "frac_reward_zero_std": 1.0, "grad_norm": 0.07567659432119535, "kl": 0.358642578125, "learning_rate": 8.260529062016294e-06, "loss": 0.0036, "num_tokens": 625081001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6009683572162176, "frac_reward_zero_std": 1.0, "grad_norm": 0.058015149951963135, "kl": 0.35986328125, "learning_rate": 8.247079823345363e-06, "loss": 0.0036, "num_tokens": 625633081.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6013596126571135, "frac_reward_zero_std": 1.0, "grad_norm": 0.10302303950318975, "kl": 0.368408203125, "learning_rate": 8.233633855114127e-06, "loss": 0.0037, "num_tokens": 626185321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6017508680980095, "frac_reward_zero_std": 1.0, "grad_norm": 1.1896919063978688, "kl": 0.48681640625, "learning_rate": 8.22019118240886e-06, "loss": 0.0049, "num_tokens": 626738233.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6021421235389055, "frac_reward_zero_std": 1.0, "grad_norm": 0.09177790202335188, "kl": 0.364990234375, "learning_rate": 8.206751830309681e-06, "loss": 0.0037, "num_tokens": 627291049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6025333789798014, "frac_reward_zero_std": 1.0, "grad_norm": 0.13039855830018193, "kl": 0.368896484375, "learning_rate": 8.193315823890519e-06, "loss": 0.0037, "num_tokens": 627846281.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6029246344206974, "frac_reward_zero_std": 1.0, "grad_norm": 0.14798471551695355, "kl": 0.385009765625, "learning_rate": 8.179883188219052e-06, "loss": 0.0038, "num_tokens": 628398841.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6033158898615933, "frac_reward_zero_std": 1.0, "grad_norm": 0.16010795820442444, "kl": 0.38037109375, "learning_rate": 8.166453948356679e-06, "loss": 0.0038, "num_tokens": 628952393.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6037071453024894, "frac_reward_zero_std": 1.0, "grad_norm": 0.13076731077558001, "kl": 0.3779296875, "learning_rate": 8.153028129358458e-06, "loss": 0.0038, "num_tokens": 629505801.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6040984007433854, "frac_reward_zero_std": 1.0, "grad_norm": 0.11158517772584571, "kl": 0.365478515625, "learning_rate": 8.139605756273067e-06, "loss": 0.0037, "num_tokens": 630060761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6044896561842813, "frac_reward_zero_std": 0.9375, "grad_norm": 0.34706484288248074, "kl": 0.3603515625, "learning_rate": 8.126186854142752e-06, "loss": 0.0036, "num_tokens": 630615673.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6048809116251773, "frac_reward_zero_std": 1.0, "grad_norm": 0.0923549827167846, "kl": 0.35400390625, "learning_rate": 8.112771448003292e-06, "loss": 0.0035, "num_tokens": 631169897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6052721670660732, "frac_reward_zero_std": 1.0, "grad_norm": 0.10092047577334774, "kl": 0.349853515625, "learning_rate": 8.099359562883931e-06, "loss": 0.0035, "num_tokens": 631724041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6056634225069693, "frac_reward_zero_std": 1.0, "grad_norm": 0.12380445868376336, "kl": 0.360595703125, "learning_rate": 8.085951223807344e-06, "loss": 0.0036, "num_tokens": 632276697.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6060546779478652, "frac_reward_zero_std": 1.0, "grad_norm": 0.11065786417876142, "kl": 0.359375, "learning_rate": 8.072546455789605e-06, "loss": 0.0036, "num_tokens": 632830953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6064459333887612, "frac_reward_zero_std": 1.0, "grad_norm": 0.10682113213123673, "kl": 0.3515625, "learning_rate": 8.059145283840114e-06, "loss": 0.0035, "num_tokens": 633385705.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6068371888296572, "frac_reward_zero_std": 0.9375, "grad_norm": 0.28478761963153454, "kl": 0.358642578125, "learning_rate": 8.045747732961563e-06, "loss": 0.0036, "num_tokens": 633938953.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6072284442705531, "frac_reward_zero_std": 0.9375, "grad_norm": 0.326467623575825, "kl": 0.354248046875, "learning_rate": 8.032353828149889e-06, "loss": 0.0035, "num_tokens": 634492409.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6076196997114491, "frac_reward_zero_std": 1.0, "grad_norm": 0.12482666436580288, "kl": 0.3505859375, "learning_rate": 8.018963594394221e-06, "loss": 0.0035, "num_tokens": 635045881.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.608010955152345, "frac_reward_zero_std": 0.9375, "grad_norm": 0.27696744491321956, "kl": 0.3623046875, "learning_rate": 8.005577056676854e-06, "loss": 0.0036, "num_tokens": 635598057.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6084022105932411, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2934628844091582, "kl": 0.36328125, "learning_rate": 7.99219423997317e-06, "loss": 0.0036, "num_tokens": 636152121.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1555 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6087934660341371, "frac_reward_zero_std": 0.9375, "grad_norm": 0.28703559340035695, "kl": 0.38818359375, "learning_rate": 7.97881516925162e-06, "loss": 0.0039, "num_tokens": 636706953.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1556 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.609184721475033, "frac_reward_zero_std": 1.0, "grad_norm": 0.12684757392077917, "kl": 0.3935546875, "learning_rate": 7.965439869473664e-06, "loss": 0.0039, "num_tokens": 637260441.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1557 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.609575976915929, "frac_reward_zero_std": 1.0, "grad_norm": 0.16016369304387762, "kl": 0.408203125, "learning_rate": 7.952068365593722e-06, "loss": 0.0041, "num_tokens": 637815913.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1558 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6099672323568249, "frac_reward_zero_std": 1.0, "grad_norm": 0.1782877996346896, "kl": 0.41259765625, "learning_rate": 7.938700682559133e-06, "loss": 0.0041, "num_tokens": 638369321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1559 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.610358487797721, "frac_reward_zero_std": 1.0, "grad_norm": 0.14439031139612496, "kl": 0.398193359375, "learning_rate": 7.925336845310111e-06, "loss": 0.004, "num_tokens": 638925657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1560 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6107497432386169, "frac_reward_zero_std": 1.0, "grad_norm": 0.11519347872975934, "kl": 0.400146484375, "learning_rate": 7.911976878779696e-06, "loss": 0.004, "num_tokens": 639480249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1561 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6111409986795129, "frac_reward_zero_std": 1.0, "grad_norm": 0.1047280535551468, "kl": 0.380615234375, "learning_rate": 7.898620807893698e-06, "loss": 0.0038, "num_tokens": 640036489.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1562 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6115322541204089, "frac_reward_zero_std": 1.0, "grad_norm": 0.09800037849698226, "kl": 0.380615234375, "learning_rate": 7.885268657570674e-06, "loss": 0.0038, "num_tokens": 640589049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1563 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6119235095613048, "frac_reward_zero_std": 0.9375, "grad_norm": 0.254413647652189, "kl": 0.364501953125, "learning_rate": 7.871920452721844e-06, "loss": 0.0036, "num_tokens": 641143817.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1564 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6123147650022008, "frac_reward_zero_std": 1.0, "grad_norm": 0.10971203656980995, "kl": 0.36669921875, "learning_rate": 7.858576218251089e-06, "loss": 0.0037, "num_tokens": 641695161.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1565 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6127060204430967, "frac_reward_zero_std": 1.0, "grad_norm": 0.2127732698615863, "kl": 0.3525390625, "learning_rate": 7.845235979054868e-06, "loss": 0.0035, "num_tokens": 642247481.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1566 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6130972758839928, "frac_reward_zero_std": 1.0, "grad_norm": 0.11467690979606984, "kl": 0.359130859375, "learning_rate": 7.831899760022192e-06, "loss": 0.0036, "num_tokens": 642799577.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1567 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6134885313248888, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2877953275781148, "kl": 0.373291015625, "learning_rate": 7.818567586034578e-06, "loss": 0.0037, "num_tokens": 643351769.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1568 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6138797867657847, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2767099653107926, "kl": 0.373291015625, "learning_rate": 7.805239481965976e-06, "loss": 0.0037, "num_tokens": 643904985.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1569 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6142710422066807, "frac_reward_zero_std": 1.0, "grad_norm": 0.12800620618054273, "kl": 0.3837890625, "learning_rate": 7.791915472682762e-06, "loss": 0.0038, "num_tokens": 644460601.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1570 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6146622976475766, "frac_reward_zero_std": 1.0, "grad_norm": 0.19385266326307354, "kl": 0.398193359375, "learning_rate": 7.778595583043667e-06, "loss": 0.004, "num_tokens": 645011305.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1571 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6150535530884726, "frac_reward_zero_std": 1.0, "grad_norm": 0.12519698709645966, "kl": 0.374267578125, "learning_rate": 7.76527983789973e-06, "loss": 0.0037, "num_tokens": 645564473.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1572 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6154448085293686, "frac_reward_zero_std": 1.0, "grad_norm": 0.10626003308150891, "kl": 0.362060546875, "learning_rate": 7.75196826209427e-06, "loss": 0.0036, "num_tokens": 646122057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1573 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6158360639702646, "frac_reward_zero_std": 0.9375, "grad_norm": 0.281420966527748, "kl": 0.359375, "learning_rate": 7.738660880462813e-06, "loss": 0.0036, "num_tokens": 646676025.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1574 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6162273194111606, "frac_reward_zero_std": 1.0, "grad_norm": 0.15585501624427622, "kl": 0.358642578125, "learning_rate": 7.725357717833067e-06, "loss": 0.0036, "num_tokens": 647231641.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1575 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6166185748520565, "frac_reward_zero_std": 1.0, "grad_norm": 0.11687763170507841, "kl": 0.348876953125, "learning_rate": 7.712058799024868e-06, "loss": 0.0035, "num_tokens": 647785993.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1576 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6170098302929525, "frac_reward_zero_std": 1.0, "grad_norm": 0.12787447247215689, "kl": 0.350830078125, "learning_rate": 7.698764148850138e-06, "loss": 0.0035, "num_tokens": 648340793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1577 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6174010857338484, "frac_reward_zero_std": 1.0, "grad_norm": 0.12637881539021617, "kl": 0.368408203125, "learning_rate": 7.685473792112824e-06, "loss": 0.0037, "num_tokens": 648892249.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1578 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6177923411747445, "frac_reward_zero_std": 1.0, "grad_norm": 0.140120541656043, "kl": 0.3583984375, "learning_rate": 7.672187753608881e-06, "loss": 0.0036, "num_tokens": 649446409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1579 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6181835966156405, "frac_reward_zero_std": 1.0, "grad_norm": 0.11453853283054279, "kl": 0.3603515625, "learning_rate": 7.658906058126183e-06, "loss": 0.0036, "num_tokens": 650000521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1580 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6185748520565364, "frac_reward_zero_std": 0.9375, "grad_norm": 0.3746765880985309, "kl": 0.364990234375, "learning_rate": 7.645628730444524e-06, "loss": 0.0036, "num_tokens": 650553081.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1581 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6189661074974324, "frac_reward_zero_std": 1.0, "grad_norm": 0.076031187630996, "kl": 0.353271484375, "learning_rate": 7.632355795335533e-06, "loss": 0.0035, "num_tokens": 651105673.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1582 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6193573629383283, "frac_reward_zero_std": 1.0, "grad_norm": 0.08349252622611092, "kl": 0.364501953125, "learning_rate": 7.619087277562656e-06, "loss": 0.0036, "num_tokens": 651660329.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1583 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6197486183792243, "frac_reward_zero_std": 1.0, "grad_norm": 0.11763927290609463, "kl": 0.369140625, "learning_rate": 7.605823201881089e-06, "loss": 0.0037, "num_tokens": 652214377.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1584 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6201398738201203, "frac_reward_zero_std": 0.9375, "grad_norm": 0.28119669777300754, "kl": 0.367919921875, "learning_rate": 7.592563593037746e-06, "loss": 0.0037, "num_tokens": 652769321.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1585 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6205311292610163, "frac_reward_zero_std": 1.0, "grad_norm": 0.10636373899388796, "kl": 0.38916015625, "learning_rate": 7.579308475771196e-06, "loss": 0.0039, "num_tokens": 653321753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1586 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6209223847019123, "frac_reward_zero_std": 1.0, "grad_norm": 0.14657572738029426, "kl": 0.388671875, "learning_rate": 7.566057874811643e-06, "loss": 0.0039, "num_tokens": 653877465.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1587 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6213136401428082, "frac_reward_zero_std": 0.875, "grad_norm": 0.4483953706819393, "kl": 0.395263671875, "learning_rate": 7.552811814880858e-06, "loss": 0.0039, "num_tokens": 654430185.0, "reward": 0.0003662109375, "reward_std": 0.0011554004158824682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1588 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6217048955837042, "frac_reward_zero_std": 1.0, "grad_norm": 0.14057288919575264, "kl": 0.396728515625, "learning_rate": 7.539570320692137e-06, "loss": 0.004, "num_tokens": 654983753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1589 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6220961510246001, "frac_reward_zero_std": 0.8125, "grad_norm": 0.5242184352187768, "kl": 0.394775390625, "learning_rate": 7.526333416950262e-06, "loss": 0.0039, "num_tokens": 655535657.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1590 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6224874064654962, "frac_reward_zero_std": 0.8125, "grad_norm": 0.44975662604436006, "kl": 0.37744140625, "learning_rate": 7.513101128351454e-06, "loss": 0.0038, "num_tokens": 656088569.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1591 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6228786619063922, "frac_reward_zero_std": 0.9375, "grad_norm": 0.27151188618476113, "kl": 0.3681640625, "learning_rate": 7.499873479583312e-06, "loss": 0.0037, "num_tokens": 656641465.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1592 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6232699173472881, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2877574264387494, "kl": 0.36181640625, "learning_rate": 7.486650495324783e-06, "loss": 0.0036, "num_tokens": 657196809.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1593 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6236611727881841, "frac_reward_zero_std": 1.0, "grad_norm": 0.20679720591384607, "kl": 0.3623046875, "learning_rate": 7.473432200246118e-06, "loss": 0.0036, "num_tokens": 657748793.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1594 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.62405242822908, "frac_reward_zero_std": 1.0, "grad_norm": 0.2047002630765954, "kl": 0.3515625, "learning_rate": 7.460218619008817e-06, "loss": 0.0035, "num_tokens": 658302057.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1595 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.624443683669976, "frac_reward_zero_std": 1.0, "grad_norm": 0.17178048786845349, "kl": 0.35009765625, "learning_rate": 7.447009776265578e-06, "loss": 0.0035, "num_tokens": 658855817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1596 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.624834939110872, "frac_reward_zero_std": 1.0, "grad_norm": 0.13704193793789554, "kl": 0.357421875, "learning_rate": 7.433805696660267e-06, "loss": 0.0036, "num_tokens": 659409369.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1597 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.625226194551768, "frac_reward_zero_std": 1.0, "grad_norm": 0.1371016236157762, "kl": 0.357666015625, "learning_rate": 7.420606404827855e-06, "loss": 0.0036, "num_tokens": 659965193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1598 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.625617449992664, "frac_reward_zero_std": 1.0, "grad_norm": 0.13883475371330284, "kl": 0.365478515625, "learning_rate": 7.407411925394389e-06, "loss": 0.0037, "num_tokens": 660518505.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1599 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6260087054335599, "frac_reward_zero_std": 1.0, "grad_norm": 0.16742017755739386, "kl": 0.374267578125, "learning_rate": 7.394222282976935e-06, "loss": 0.0037, "num_tokens": 661072105.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1600 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6263999608744559, "frac_reward_zero_std": 0.9375, "grad_norm": 0.28702337754814217, "kl": 0.3662109375, "learning_rate": 7.3810375021835275e-06, "loss": 0.0037, "num_tokens": 661627145.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1601 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6267912163153518, "frac_reward_zero_std": 0.875, "grad_norm": 0.371075024248334, "kl": 0.375, "learning_rate": 7.367857607613147e-06, "loss": 0.0037, "num_tokens": 662180601.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1602 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6271824717562479, "frac_reward_zero_std": 1.0, "grad_norm": 0.14666152607363123, "kl": 0.36474609375, "learning_rate": 7.354682623855635e-06, "loss": 0.0037, "num_tokens": 662734537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1603 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6275737271971439, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2407021806723514, "kl": 0.353515625, "learning_rate": 7.341512575491689e-06, "loss": 0.0035, "num_tokens": 663290745.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1604 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6279649826380398, "frac_reward_zero_std": 0.9375, "grad_norm": 0.33544577084616695, "kl": 0.352783203125, "learning_rate": 7.3283474870927905e-06, "loss": 0.0035, "num_tokens": 663845609.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1605 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6283562380789358, "frac_reward_zero_std": 0.875, "grad_norm": 0.37501163104079044, "kl": 0.3544921875, "learning_rate": 7.315187383221169e-06, "loss": 0.0035, "num_tokens": 664398777.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1606 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6287474935198317, "frac_reward_zero_std": 0.75, "grad_norm": 0.5070382729294408, "kl": 0.3720703125, "learning_rate": 7.3020322884297565e-06, "loss": 0.0037, "num_tokens": 664951193.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 1607 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6291387489607277, "frac_reward_zero_std": 0.5, "grad_norm": 0.797443829076179, "kl": 0.377685546875, "learning_rate": 7.28888222726214e-06, "loss": 0.0038, "num_tokens": 665506777.0, "reward": 0.00146484375, "reward_std": 0.004562974441796541, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01171875, "rewards/tag_count_reward/std": 0.05294628441333771, "step": 1608 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6295300044016237, "frac_reward_zero_std": 0.375, "grad_norm": 1.1401688955386884, "kl": 0.4091796875, "learning_rate": 7.275737224252504e-06, "loss": 0.0041, "num_tokens": 666061705.0, "reward": 0.0020751953125, "reward_std": 0.0060760509222745895, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0166015625, "rewards/tag_count_reward/std": 0.062369659543037415, "step": 1609 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6299212598425197, "frac_reward_zero_std": 0.0, "grad_norm": 1.6499554069131546, "kl": 0.49609375, "learning_rate": 7.2625973039256094e-06, "loss": 0.005, "num_tokens": 666617689.0, "reward": 0.0067138671875, "reward_std": 0.012727686204016209, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0537109375, "rewards/tag_count_reward/std": 0.1052350252866745, "step": 1610 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6303125152834157, "frac_reward_zero_std": 0.0, "grad_norm": 2.10560315165211, "kl": 0.6748046875, "learning_rate": 7.24946249079673e-06, "loss": 0.0067, "num_tokens": 667170969.0, "reward": 0.0091552734375, "reward_std": 0.014196117408573627, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0732421875, "rewards/tag_count_reward/std": 0.11400394141674042, "step": 1611 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6307037707243116, "frac_reward_zero_std": 0.0625, "grad_norm": 3.571364132256207, "kl": 0.5888671875, "learning_rate": 7.236332809371609e-06, "loss": 0.0059, "num_tokens": 667724393.0, "reward": 0.0067138671875, "reward_std": 0.012518822215497494, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0537109375, "rewards/tag_count_reward/std": 0.1028796136379242, "step": 1612 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6310950261652076, "frac_reward_zero_std": 0.0, "grad_norm": 6.385338339330282, "kl": 0.6552734375, "learning_rate": 7.223208284146421e-06, "loss": 0.0065, "num_tokens": 668275705.0, "reward": 0.0052490234375, "reward_std": 0.01145400945097208, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0419921875, "rewards/tag_count_reward/std": 0.0936427116394043, "step": 1613 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6314862816061035, "frac_reward_zero_std": 0.0625, "grad_norm": 1.4318716506605433, "kl": 0.564453125, "learning_rate": 7.210088939607709e-06, "loss": 0.0056, "num_tokens": 668831353.0, "reward": 0.0054931640625, "reward_std": 0.011164426803588867, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0439453125, "rewards/tag_count_reward/std": 0.0953448936343193, "step": 1614 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6318775370469996, "frac_reward_zero_std": 0.0, "grad_norm": 2.4382418459452944, "kl": 0.58740234375, "learning_rate": 7.196974800232364e-06, "loss": 0.0059, "num_tokens": 669386201.0, "reward": 0.005859375, "reward_std": 0.012133578769862652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.046875, "rewards/tag_count_reward/std": 0.09776923805475235, "step": 1615 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6322687924878956, "frac_reward_zero_std": 0.0, "grad_norm": 1.3101156819452524, "kl": 0.61669921875, "learning_rate": 7.183865890487554e-06, "loss": 0.0062, "num_tokens": 669940249.0, "reward": 0.0062255859375, "reward_std": 0.012498813681304455, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0498046875, "rewards/tag_count_reward/std": 0.10004881769418716, "step": 1616 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6326600479287915, "frac_reward_zero_std": 0.0, "grad_norm": 1.6369459333021, "kl": 0.736328125, "learning_rate": 7.1707622348307e-06, "loss": 0.0074, "num_tokens": 670493129.0, "reward": 0.0123291015625, "reward_std": 0.015313942916691303, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0986328125, "rewards/tag_count_reward/std": 0.12242680042982101, "step": 1617 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6330513033696875, "frac_reward_zero_std": 0.0, "grad_norm": 2.8196301436667075, "kl": 0.71826171875, "learning_rate": 7.157663857709416e-06, "loss": 0.0072, "num_tokens": 671045465.0, "reward": 0.00732421875, "reward_std": 0.012718389742076397, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.05859375, "rewards/tag_count_reward/std": 0.10610933601856232, "step": 1618 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6334425588105834, "frac_reward_zero_std": 0.0625, "grad_norm": 1.6627622615547062, "kl": 0.7109375, "learning_rate": 7.144570783561468e-06, "loss": 0.0071, "num_tokens": 671601065.0, "reward": 0.00439453125, "reward_std": 0.010449940338730812, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.03515625, "rewards/tag_count_reward/std": 0.08707881718873978, "step": 1619 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6338338142514794, "frac_reward_zero_std": 0.8125, "grad_norm": 0.8006125441335165, "kl": 0.66943359375, "learning_rate": 7.131483036814721e-06, "loss": 0.0067, "num_tokens": 672153273.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1620 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6342250696923755, "frac_reward_zero_std": 0.8125, "grad_norm": 0.7212102143055718, "kl": 0.58056640625, "learning_rate": 7.118400641887116e-06, "loss": 0.0058, "num_tokens": 672708217.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1621 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6346163251332714, "frac_reward_zero_std": 0.625, "grad_norm": 0.7165817310212169, "kl": 0.5654296875, "learning_rate": 7.105323623186595e-06, "loss": 0.0057, "num_tokens": 673261913.0, "reward": 0.0008544921875, "reward_std": 0.0031085254158824682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0068359375, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 1622 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6350075805741674, "frac_reward_zero_std": 0.9375, "grad_norm": 0.3412975228685002, "kl": 0.5224609375, "learning_rate": 7.0922520051110775e-06, "loss": 0.0052, "num_tokens": 673815737.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1623 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6353988360150633, "frac_reward_zero_std": 0.6875, "grad_norm": 0.34593441156350124, "kl": 0.4833984375, "learning_rate": 7.079185812048403e-06, "loss": 0.0048, "num_tokens": 674369225.0, "reward": 0.000732421875, "reward_std": 0.0026202441658824682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.005859375, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 1624 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6357900914559593, "frac_reward_zero_std": 0.875, "grad_norm": 0.33690349470789827, "kl": 0.464111328125, "learning_rate": 7.066125068376297e-06, "loss": 0.0046, "num_tokens": 674922601.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1625 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6361813468968552, "frac_reward_zero_std": 0.9375, "grad_norm": 0.21497790284370322, "kl": 0.454345703125, "learning_rate": 7.053069798462303e-06, "loss": 0.0045, "num_tokens": 675475449.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1626 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6365726023377513, "frac_reward_zero_std": 0.9375, "grad_norm": 0.21565035716715805, "kl": 0.429443359375, "learning_rate": 7.040020026663767e-06, "loss": 0.0043, "num_tokens": 676029289.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1627 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6369638577786473, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2313844238570309, "kl": 0.400390625, "learning_rate": 7.026975777327769e-06, "loss": 0.004, "num_tokens": 676581353.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1628 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6373551132195432, "frac_reward_zero_std": 1.0, "grad_norm": 0.7437064865912315, "kl": 0.60986328125, "learning_rate": 7.0139370747910884e-06, "loss": 0.0061, "num_tokens": 677135561.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1629 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6377463686604392, "frac_reward_zero_std": 1.0, "grad_norm": 0.1525219435988304, "kl": 0.387451171875, "learning_rate": 7.000903943380159e-06, "loss": 0.0039, "num_tokens": 677688537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1630 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6381376241013351, "frac_reward_zero_std": 0.9375, "grad_norm": 0.27713699859095076, "kl": 0.416259765625, "learning_rate": 6.987876407411012e-06, "loss": 0.0042, "num_tokens": 678241273.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1631 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6385288795422311, "frac_reward_zero_std": 0.9375, "grad_norm": 0.22181012774212877, "kl": 0.42041015625, "learning_rate": 6.974854491189243e-06, "loss": 0.0042, "num_tokens": 678794361.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1632 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6389201349831272, "frac_reward_zero_std": 0.8125, "grad_norm": 0.2631704554224055, "kl": 0.42578125, "learning_rate": 6.961838219009968e-06, "loss": 0.0043, "num_tokens": 679350521.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1633 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6393113904240231, "frac_reward_zero_std": 0.9375, "grad_norm": 0.27423930450758505, "kl": 0.431396484375, "learning_rate": 6.9488276151577685e-06, "loss": 0.0043, "num_tokens": 679908185.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1634 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6397026458649191, "frac_reward_zero_std": 0.875, "grad_norm": 0.21038481651143262, "kl": 0.438232421875, "learning_rate": 6.935822703906648e-06, "loss": 0.0044, "num_tokens": 680462153.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1635 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.640093901305815, "frac_reward_zero_std": 1.0, "grad_norm": 0.14215047237825107, "kl": 0.424072265625, "learning_rate": 6.922823509519996e-06, "loss": 0.0042, "num_tokens": 681015529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1636 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.640485156746711, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1736144346840176, "kl": 0.400634765625, "learning_rate": 6.909830056250527e-06, "loss": 0.004, "num_tokens": 681568633.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1637 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6408764121876069, "frac_reward_zero_std": 0.9375, "grad_norm": 0.16686651721384851, "kl": 0.373779296875, "learning_rate": 6.896842368340253e-06, "loss": 0.0037, "num_tokens": 682122425.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1638 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.641267667628503, "frac_reward_zero_std": 1.0, "grad_norm": 0.10703599179935579, "kl": 0.364013671875, "learning_rate": 6.883860470020422e-06, "loss": 0.0036, "num_tokens": 682676729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1639 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.641658923069399, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1652474431018238, "kl": 0.34716796875, "learning_rate": 6.8708843855114895e-06, "loss": 0.0035, "num_tokens": 683233145.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1640 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6420501785102949, "frac_reward_zero_std": 0.9375, "grad_norm": 0.17996386840440837, "kl": 0.346435546875, "learning_rate": 6.857914139023058e-06, "loss": 0.0035, "num_tokens": 683787609.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1641 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6424414339511909, "frac_reward_zero_std": 0.9375, "grad_norm": 0.3011321890622464, "kl": 0.340087890625, "learning_rate": 6.844949754753833e-06, "loss": 0.0034, "num_tokens": 684344137.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1642 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6428326893920868, "frac_reward_zero_std": 0.875, "grad_norm": 0.3125812694192165, "kl": 0.32373046875, "learning_rate": 6.831991256891592e-06, "loss": 0.0032, "num_tokens": 684897689.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1643 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6432239448329828, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1260141998544915, "kl": 0.319091796875, "learning_rate": 6.819038669613125e-06, "loss": 0.0032, "num_tokens": 685451993.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1644 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6436152002738789, "frac_reward_zero_std": 0.9375, "grad_norm": 0.303854318405297, "kl": 0.323974609375, "learning_rate": 6.8060920170842e-06, "loss": 0.0032, "num_tokens": 686006249.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1645 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6440064557147748, "frac_reward_zero_std": 0.8125, "grad_norm": 0.31836668945624386, "kl": 0.329345703125, "learning_rate": 6.793151323459506e-06, "loss": 0.0033, "num_tokens": 686561193.0, "reward": 0.00048828125, "reward_std": 0.0016436816658824682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 1646 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6443977111556708, "frac_reward_zero_std": 0.6875, "grad_norm": 0.3262990294357117, "kl": 0.343017578125, "learning_rate": 6.78021661288262e-06, "loss": 0.0034, "num_tokens": 687114665.0, "reward": 0.000732421875, "reward_std": 0.0026202441658824682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.005859375, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 1647 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6447889665965667, "frac_reward_zero_std": 0.5625, "grad_norm": 0.3834575448365499, "kl": 0.345947265625, "learning_rate": 6.767287909485947e-06, "loss": 0.0035, "num_tokens": 687669929.0, "reward": 0.0008544921875, "reward_std": 0.00341796875, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0068359375, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 1648 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6451802220374627, "frac_reward_zero_std": 0.5, "grad_norm": 0.41404602679430935, "kl": 0.35009765625, "learning_rate": 6.7543652373906966e-06, "loss": 0.0035, "num_tokens": 688224425.0, "reward": 0.001220703125, "reward_std": 0.0042639258317649364, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.009765625, "rewards/tag_count_reward/std": 0.048530805855989456, "step": 1649 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6455714774783586, "frac_reward_zero_std": 0.375, "grad_norm": 0.5922575828306605, "kl": 0.35986328125, "learning_rate": 6.741448620706816e-06, "loss": 0.0036, "num_tokens": 688778233.0, "reward": 0.002197265625, "reward_std": 0.006254888605326414, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.017578125, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 1650 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6459627329192547, "frac_reward_zero_std": 0.1875, "grad_norm": 1.0150660122290072, "kl": 0.404541015625, "learning_rate": 6.728538083532961e-06, "loss": 0.004, "num_tokens": 689332345.0, "reward": 0.0035400390625, "reward_std": 0.00888185016810894, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0283203125, "rewards/tag_count_reward/std": 0.07938928157091141, "step": 1651 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6463539883601507, "frac_reward_zero_std": 0.0, "grad_norm": 0.9266230706904591, "kl": 0.423828125, "learning_rate": 6.715633649956444e-06, "loss": 0.0042, "num_tokens": 689885481.0, "reward": 0.0048828125, "reward_std": 0.01111338660120964, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0390625, "rewards/tag_count_reward/std": 0.0909508615732193, "step": 1652 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6467452438010466, "frac_reward_zero_std": 0.0, "grad_norm": 2.1710241203580787, "kl": 0.469482421875, "learning_rate": 6.702735344053187e-06, "loss": 0.0047, "num_tokens": 690438473.0, "reward": 0.0084228515625, "reward_std": 0.013765223324298859, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0673828125, "rewards/tag_count_reward/std": 0.11114637553691864, "step": 1653 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6471364992419426, "frac_reward_zero_std": 0.0, "grad_norm": 27.593614452481248, "kl": 0.60205078125, "learning_rate": 6.68984318988768e-06, "loss": 0.006, "num_tokens": 690992649.0, "reward": 0.0106201171875, "reward_std": 0.014640686102211475, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0849609375, "rewards/tag_count_reward/std": 0.11864595115184784, "step": 1654 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6475277546828385, "frac_reward_zero_std": 0.5625, "grad_norm": 54.18738449773052, "kl": 0.71337890625, "learning_rate": 6.676957211512936e-06, "loss": 0.0071, "num_tokens": 691547273.0, "reward": 0.00146484375, "reward_std": 0.004253530874848366, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01171875, "rewards/tag_count_reward/std": 0.05294628441333771, "step": 1655 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6479190101237345, "frac_reward_zero_std": 0.8125, "grad_norm": 27.008512805806912, "kl": 0.7451171875, "learning_rate": 6.664077432970446e-06, "loss": 0.0075, "num_tokens": 692100377.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1656 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6483102655646306, "frac_reward_zero_std": 0.75, "grad_norm": 30.292608954678503, "kl": 0.74560546875, "learning_rate": 6.651203878290139e-06, "loss": 0.0075, "num_tokens": 692654217.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 1657 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6487015210055265, "frac_reward_zero_std": 0.75, "grad_norm": 0.7111400039802233, "kl": 0.7705078125, "learning_rate": 6.638336571490325e-06, "loss": 0.0077, "num_tokens": 693206937.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 1658 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6490927764464225, "frac_reward_zero_std": 0.5, "grad_norm": 0.754145221510217, "kl": 0.71923828125, "learning_rate": 6.625475536577655e-06, "loss": 0.0072, "num_tokens": 693759753.0, "reward": 0.0010986328125, "reward_std": 0.004085088148713112, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 1659 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6494840318873184, "frac_reward_zero_std": 0.125, "grad_norm": 1.011889128831703, "kl": 0.71142578125, "learning_rate": 6.612620797547087e-06, "loss": 0.0071, "num_tokens": 694313129.0, "reward": 0.0054931640625, "reward_std": 0.010913610458374023, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0439453125, "rewards/tag_count_reward/std": 0.0953448936343193, "step": 1660 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6498752873282144, "frac_reward_zero_std": 0.75, "grad_norm": 0.7487065377824011, "kl": 0.6015625, "learning_rate": 6.59977237838183e-06, "loss": 0.006, "num_tokens": 694866249.0, "reward": 0.000732421875, "reward_std": 0.0022521736100316048, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.005859375, "rewards/tag_count_reward/std": 0.03789619356393814, "step": 1661 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6502665427691103, "frac_reward_zero_std": 0.9375, "grad_norm": 0.4694988034429469, "kl": 0.57080078125, "learning_rate": 6.586930303053297e-06, "loss": 0.0057, "num_tokens": 695418505.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1662 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6506577982100064, "frac_reward_zero_std": 0.9375, "grad_norm": 1.4418296843880782, "kl": 0.81689453125, "learning_rate": 6.574094595521072e-06, "loss": 0.0082, "num_tokens": 695969897.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1663 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6510490536509024, "frac_reward_zero_std": 1.0, "grad_norm": 0.5308459030731489, "kl": 0.55322265625, "learning_rate": 6.561265279732858e-06, "loss": 0.0055, "num_tokens": 696527049.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1664 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6514403090917983, "frac_reward_zero_std": 1.0, "grad_norm": 0.23410247196260175, "kl": 0.5556640625, "learning_rate": 6.548442379624425e-06, "loss": 0.0056, "num_tokens": 697082649.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1665 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6518315645326943, "frac_reward_zero_std": 1.0, "grad_norm": 0.3849494273664346, "kl": 0.62646484375, "learning_rate": 6.535625919119579e-06, "loss": 0.0063, "num_tokens": 697637657.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1666 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6522228199735902, "frac_reward_zero_std": 1.0, "grad_norm": 0.25718743440281666, "kl": 0.56787109375, "learning_rate": 6.522815922130112e-06, "loss": 0.0057, "num_tokens": 698191529.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1667 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6526140754144862, "frac_reward_zero_std": 1.0, "grad_norm": 0.23472719833073366, "kl": 0.52587890625, "learning_rate": 6.5100124125557596e-06, "loss": 0.0053, "num_tokens": 698744761.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1668 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6530053308553823, "frac_reward_zero_std": 1.0, "grad_norm": 0.18688277184766844, "kl": 0.491455078125, "learning_rate": 6.497215414284146e-06, "loss": 0.0049, "num_tokens": 699297609.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1669 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6533965862962782, "frac_reward_zero_std": 1.0, "grad_norm": 0.16893191409331665, "kl": 0.447021484375, "learning_rate": 6.4844249511907574e-06, "loss": 0.0045, "num_tokens": 699852985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1670 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6537878417371742, "frac_reward_zero_std": 1.0, "grad_norm": 0.14392895314641613, "kl": 0.394775390625, "learning_rate": 6.471641047138874e-06, "loss": 0.004, "num_tokens": 700410313.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1671 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6541790971780701, "frac_reward_zero_std": 1.0, "grad_norm": 0.15234715059595225, "kl": 0.375244140625, "learning_rate": 6.458863725979549e-06, "loss": 0.0038, "num_tokens": 700963817.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1672 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6545703526189661, "frac_reward_zero_std": 1.0, "grad_norm": 0.11895314976085412, "kl": 0.336181640625, "learning_rate": 6.446093011551551e-06, "loss": 0.0034, "num_tokens": 701516121.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1673 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.654961608059862, "frac_reward_zero_std": 1.0, "grad_norm": 0.10453934307244189, "kl": 0.312255859375, "learning_rate": 6.433328927681322e-06, "loss": 0.0031, "num_tokens": 702068409.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1674 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.655352863500758, "frac_reward_zero_std": 1.0, "grad_norm": 0.09488943381634872, "kl": 0.29541015625, "learning_rate": 6.420571498182939e-06, "loss": 0.003, "num_tokens": 702622041.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1675 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6557441189416541, "frac_reward_zero_std": 1.0, "grad_norm": 0.0845979668503392, "kl": 0.279052734375, "learning_rate": 6.4078207468580515e-06, "loss": 0.0028, "num_tokens": 703177193.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1676 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.65613537438255, "frac_reward_zero_std": 1.0, "grad_norm": 0.08186187007236481, "kl": 0.26708984375, "learning_rate": 6.395076697495854e-06, "loss": 0.0027, "num_tokens": 703730985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1677 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.656526629823446, "frac_reward_zero_std": 1.0, "grad_norm": 0.08661156690732119, "kl": 0.26318359375, "learning_rate": 6.3823393738730465e-06, "loss": 0.0026, "num_tokens": 704284265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1678 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6569178852643419, "frac_reward_zero_std": 1.0, "grad_norm": 0.10623550850983556, "kl": 0.262939453125, "learning_rate": 6.369608799753772e-06, "loss": 0.0026, "num_tokens": 704837017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1679 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6573091407052379, "frac_reward_zero_std": 1.0, "grad_norm": 0.08599770544303449, "kl": 0.25537109375, "learning_rate": 6.356884998889582e-06, "loss": 0.0026, "num_tokens": 705390953.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1680 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.657700396146134, "frac_reward_zero_std": 1.0, "grad_norm": 0.09327028333281374, "kl": 0.25439453125, "learning_rate": 6.344167995019395e-06, "loss": 0.0025, "num_tokens": 705944985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1681 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6580916515870299, "frac_reward_zero_std": 1.0, "grad_norm": 0.15582650761810368, "kl": 0.276611328125, "learning_rate": 6.331457811869437e-06, "loss": 0.0028, "num_tokens": 706497769.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1682 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6584829070279259, "frac_reward_zero_std": 1.0, "grad_norm": 0.26002581201944036, "kl": 0.294921875, "learning_rate": 6.318754473153221e-06, "loss": 0.0029, "num_tokens": 707053321.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1683 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6588741624688218, "frac_reward_zero_std": 1.0, "grad_norm": 0.08943056490571624, "kl": 0.27880859375, "learning_rate": 6.306058002571488e-06, "loss": 0.0028, "num_tokens": 707606777.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1684 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6592654179097178, "frac_reward_zero_std": 1.0, "grad_norm": 0.07045741801513182, "kl": 0.28564453125, "learning_rate": 6.29336842381216e-06, "loss": 0.0029, "num_tokens": 708158969.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1685 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6596566733506137, "frac_reward_zero_std": 1.0, "grad_norm": 0.09546214355027303, "kl": 0.299560546875, "learning_rate": 6.280685760550303e-06, "loss": 0.003, "num_tokens": 708713177.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1686 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6600479287915098, "frac_reward_zero_std": 1.0, "grad_norm": 0.1034699862153152, "kl": 0.326171875, "learning_rate": 6.2680100364480876e-06, "loss": 0.0033, "num_tokens": 709266889.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1687 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6604391842324058, "frac_reward_zero_std": 1.0, "grad_norm": 0.1033729659603444, "kl": 0.33642578125, "learning_rate": 6.255341275154725e-06, "loss": 0.0034, "num_tokens": 709821001.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1688 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6608304396733017, "frac_reward_zero_std": 1.0, "grad_norm": 0.09488830832779412, "kl": 0.349609375, "learning_rate": 6.242679500306443e-06, "loss": 0.0035, "num_tokens": 710373145.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1689 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6612216951141977, "frac_reward_zero_std": 1.0, "grad_norm": 0.10301532548062393, "kl": 0.366455078125, "learning_rate": 6.230024735526436e-06, "loss": 0.0037, "num_tokens": 710927497.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1690 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6616129505550936, "frac_reward_zero_std": 1.0, "grad_norm": 0.08624210990634783, "kl": 0.36767578125, "learning_rate": 6.217377004424819e-06, "loss": 0.0037, "num_tokens": 711482537.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1691 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6620042059959896, "frac_reward_zero_std": 1.0, "grad_norm": 0.0799863536559895, "kl": 0.384033203125, "learning_rate": 6.204736330598585e-06, "loss": 0.0038, "num_tokens": 712037385.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1692 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6623954614368857, "frac_reward_zero_std": 1.0, "grad_norm": 0.07689753142097716, "kl": 0.386474609375, "learning_rate": 6.192102737631552e-06, "loss": 0.0039, "num_tokens": 712589897.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1693 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6627867168777816, "frac_reward_zero_std": 1.0, "grad_norm": 0.08488893381360861, "kl": 0.394775390625, "learning_rate": 6.179476249094336e-06, "loss": 0.0039, "num_tokens": 713143417.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1694 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6631779723186776, "frac_reward_zero_std": 1.0, "grad_norm": 0.07904574749589661, "kl": 0.393310546875, "learning_rate": 6.166856888544297e-06, "loss": 0.0039, "num_tokens": 713696729.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1695 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6635692277595735, "frac_reward_zero_std": 1.0, "grad_norm": 0.07548309844310998, "kl": 0.3974609375, "learning_rate": 6.154244679525494e-06, "loss": 0.004, "num_tokens": 714249433.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1696 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6639604832004695, "frac_reward_zero_std": 1.0, "grad_norm": 0.11522242804451076, "kl": 0.3857421875, "learning_rate": 6.141639645568646e-06, "loss": 0.0039, "num_tokens": 714806265.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1697 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6643517386413654, "frac_reward_zero_std": 0.9375, "grad_norm": 0.16816581418846419, "kl": 0.39111328125, "learning_rate": 6.129041810191085e-06, "loss": 0.0039, "num_tokens": 715359417.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1698 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6647429940822615, "frac_reward_zero_std": 0.9375, "grad_norm": 0.22576761963436967, "kl": 0.3935546875, "learning_rate": 6.1164511968967066e-06, "loss": 0.0039, "num_tokens": 715912761.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1699 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6651342495231575, "frac_reward_zero_std": 1.0, "grad_norm": 0.11263259110577065, "kl": 0.3857421875, "learning_rate": 6.103867829175936e-06, "loss": 0.0039, "num_tokens": 716465545.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1700 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6655255049640534, "frac_reward_zero_std": 1.0, "grad_norm": 0.07820071677769684, "kl": 0.37255859375, "learning_rate": 6.091291730505684e-06, "loss": 0.0037, "num_tokens": 717018681.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1701 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6659167604049494, "frac_reward_zero_std": 1.0, "grad_norm": 0.07506174819712189, "kl": 0.364501953125, "learning_rate": 6.0787229243493e-06, "loss": 0.0036, "num_tokens": 717573753.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1702 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6663080158458453, "frac_reward_zero_std": 1.0, "grad_norm": 0.08534571528380666, "kl": 0.370849609375, "learning_rate": 6.066161434156521e-06, "loss": 0.0037, "num_tokens": 718127977.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1703 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6666992712867413, "frac_reward_zero_std": 1.0, "grad_norm": 0.07240557906277918, "kl": 0.369873046875, "learning_rate": 6.0536072833634316e-06, "loss": 0.0037, "num_tokens": 718681017.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1704 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6670905267276374, "frac_reward_zero_std": 1.0, "grad_norm": 0.06622868218095848, "kl": 0.37841796875, "learning_rate": 6.041060495392437e-06, "loss": 0.0038, "num_tokens": 719234521.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1705 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6674817821685333, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1802526686633663, "kl": 0.379150390625, "learning_rate": 6.028521093652195e-06, "loss": 0.0038, "num_tokens": 719790457.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1706 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6678730376094293, "frac_reward_zero_std": 1.0, "grad_norm": 0.09148537184976376, "kl": 0.37890625, "learning_rate": 6.015989101537586e-06, "loss": 0.0038, "num_tokens": 720344985.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1707 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6682642930503252, "frac_reward_zero_std": 0.9375, "grad_norm": 0.19839692876374218, "kl": 0.380859375, "learning_rate": 6.003464542429666e-06, "loss": 0.0038, "num_tokens": 720900297.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1708 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6686555484912212, "frac_reward_zero_std": 0.9375, "grad_norm": 0.1784264537266357, "kl": 0.396240234375, "learning_rate": 5.9909474396956245e-06, "loss": 0.004, "num_tokens": 721453625.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1709 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6690468039321171, "frac_reward_zero_std": 0.9375, "grad_norm": 0.19493409281761037, "kl": 0.38720703125, "learning_rate": 5.9784378166887345e-06, "loss": 0.0039, "num_tokens": 722007737.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1710 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6694380593730132, "frac_reward_zero_std": 0.875, "grad_norm": 0.2623245816121728, "kl": 0.391845703125, "learning_rate": 5.965935696748322e-06, "loss": 0.0039, "num_tokens": 722561081.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1711 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6698293148139092, "frac_reward_zero_std": 0.9375, "grad_norm": 0.2019006060021691, "kl": 0.37646484375, "learning_rate": 5.953441103199704e-06, "loss": 0.0038, "num_tokens": 723115577.0, "reward": 0.0001220703125, "reward_std": 0.00048828125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0009765625, "rewards/tag_count_reward/std": 0.015625, "step": 1712 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6702205702548051, "frac_reward_zero_std": 0.8125, "grad_norm": 0.36616612223408257, "kl": 0.380126953125, "learning_rate": 5.940954059354165e-06, "loss": 0.0038, "num_tokens": 723671081.0, "reward": 0.0003662109375, "reward_std": 0.00146484375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0029296875, "rewards/tag_count_reward/std": 0.02695695497095585, "step": 1713 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6706118256957011, "frac_reward_zero_std": 1.0, "grad_norm": 0.0754224302555887, "kl": 0.384521484375, "learning_rate": 5.928474588508903e-06, "loss": 0.0038, "num_tokens": 724224361.0, "reward": 0.0, "reward_std": 0.0, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0, "rewards/tag_count_reward/std": 0.0, "step": 1714 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.671003081136597, "frac_reward_zero_std": 0.875, "grad_norm": 0.3064231643350804, "kl": 0.386474609375, "learning_rate": 5.9160027139469834e-06, "loss": 0.0039, "num_tokens": 724775353.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1715 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.671394336577493, "frac_reward_zero_std": 0.875, "grad_norm": 0.2882599805901264, "kl": 0.388427734375, "learning_rate": 5.903538458937294e-06, "loss": 0.0039, "num_tokens": 725329465.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1716 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.671785592018389, "frac_reward_zero_std": 0.875, "grad_norm": 0.2739868950289847, "kl": 0.375732421875, "learning_rate": 5.891081846734519e-06, "loss": 0.0038, "num_tokens": 725884217.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1717 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.672176847459285, "frac_reward_zero_std": 0.6875, "grad_norm": 0.4033959378664107, "kl": 0.38232421875, "learning_rate": 5.878632900579077e-06, "loss": 0.0038, "num_tokens": 726436793.0, "reward": 0.0006103515625, "reward_std": 0.00244140625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0048828125, "rewards/tag_count_reward/std": 0.034663453698158264, "step": 1718 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.672568102900181, "frac_reward_zero_std": 0.875, "grad_norm": 0.3064786585910867, "kl": 0.385986328125, "learning_rate": 5.866191643697079e-06, "loss": 0.0039, "num_tokens": 726991241.0, "reward": 0.000244140625, "reward_std": 0.0009765625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.001953125, "rewards/tag_count_reward/std": 0.022053716704249382, "step": 1719 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6729593583410769, "frac_reward_zero_std": 0.75, "grad_norm": 0.3862136630646623, "kl": 0.3828125, "learning_rate": 5.853758099300305e-06, "loss": 0.0038, "num_tokens": 727545689.0, "reward": 0.00048828125, "reward_std": 0.001953125, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.00390625, "rewards/tag_count_reward/std": 0.03106563352048397, "step": 1720 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6733506137819729, "frac_reward_zero_std": 0.5, "grad_norm": 0.6346132399453885, "kl": 0.388916015625, "learning_rate": 5.841332290586126e-06, "loss": 0.0039, "num_tokens": 728099337.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0078125, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 1721 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6737418692228688, "frac_reward_zero_std": 0.4375, "grad_norm": 0.669896035642243, "kl": 0.387451171875, "learning_rate": 5.828914240737496e-06, "loss": 0.0039, "num_tokens": 728655449.0, "reward": 0.0018310546875, "reward_std": 0.0052576009184122086, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0146484375, "rewards/tag_count_reward/std": 0.058830711990594864, "step": 1722 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6741331246637648, "frac_reward_zero_std": 0.125, "grad_norm": 0.9011688817305824, "kl": 0.4189453125, "learning_rate": 5.816503972922885e-06, "loss": 0.0042, "num_tokens": 729211769.0, "reward": 0.00390625, "reward_std": 0.009400473907589912, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.03125, "rewards/tag_count_reward/std": 0.08284168690443039, "step": 1723 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6745243801046609, "frac_reward_zero_std": 0.0, "grad_norm": 0.9704286199131417, "kl": 0.4609375, "learning_rate": 5.804101510296245e-06, "loss": 0.0046, "num_tokens": 729766073.0, "reward": 0.0059814453125, "reward_std": 0.012312415987253189, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0478515625, "rewards/tag_count_reward/std": 0.09854467958211899, "step": 1724 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6749156355455568, "frac_reward_zero_std": 0.0, "grad_norm": 1.255949328808867, "kl": 0.49462890625, "learning_rate": 5.791706875996974e-06, "loss": 0.005, "num_tokens": 730319705.0, "reward": 0.0106201171875, "reward_std": 0.014958464540541172, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0849609375, "rewards/tag_count_reward/std": 0.11864595115184784, "step": 1725 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6753068909864528, "frac_reward_zero_std": 0.0, "grad_norm": 1.1571314205736658, "kl": 0.61328125, "learning_rate": 5.779320093149855e-06, "loss": 0.0061, "num_tokens": 730874825.0, "reward": 0.0084228515625, "reward_std": 0.013375960290431976, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0673828125, "rewards/tag_count_reward/std": 0.11114637553691864, "step": 1726 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6756981464273487, "frac_reward_zero_std": 0.0, "grad_norm": 2.6057769080747013, "kl": 0.6904296875, "learning_rate": 5.766941184865024e-06, "loss": 0.0069, "num_tokens": 731431945.0, "reward": 0.0079345703125, "reward_std": 0.013283168897032738, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0634765625, "rewards/tag_count_reward/std": 0.1090243011713028, "step": 1727 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6760894018682447, "frac_reward_zero_std": 0.0625, "grad_norm": 1.2402423688642952, "kl": 0.67431640625, "learning_rate": 5.754570174237928e-06, "loss": 0.0067, "num_tokens": 731988281.0, "reward": 0.0067138671875, "reward_std": 0.012113664299249649, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0537109375, "rewards/tag_count_reward/std": 0.1052350252866745, "step": 1728 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6764806573091408, "frac_reward_zero_std": 0.0, "grad_norm": 1.4274928080355824, "kl": 0.6455078125, "learning_rate": 5.742207084349274e-06, "loss": 0.0065, "num_tokens": 732541417.0, "reward": 0.0106201171875, "reward_std": 0.01479667890816927, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0849609375, "rewards/tag_count_reward/std": 0.11864595115184784, "step": 1729 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6768719127500367, "frac_reward_zero_std": 0.0, "grad_norm": 1.7971737803037768, "kl": 0.6328125, "learning_rate": 5.729851938265002e-06, "loss": 0.0063, "num_tokens": 733095897.0, "reward": 0.0113525390625, "reward_std": 0.01496095396578312, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0908203125, "rewards/tag_count_reward/std": 0.12047175318002701, "step": 1730 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6772631681909327, "frac_reward_zero_std": 0.0, "grad_norm": 1.1040094297247176, "kl": 0.58935546875, "learning_rate": 5.717504759036222e-06, "loss": 0.0059, "num_tokens": 733650329.0, "reward": 0.00830078125, "reward_std": 0.013769853860139847, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.06640625, "rewards/tag_count_reward/std": 0.11063265055418015, "step": 1731 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6776544236318286, "frac_reward_zero_std": 0.0625, "grad_norm": 1.1222565123552917, "kl": 0.533203125, "learning_rate": 5.7051655696991825e-06, "loss": 0.0053, "num_tokens": 734203641.0, "reward": 0.0052490234375, "reward_std": 0.011281829327344894, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0419921875, "rewards/tag_count_reward/std": 0.0936427116394043, "step": 1732 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6780456790727246, "frac_reward_zero_std": 0.3125, "grad_norm": 4.031229322966141, "kl": 0.499267578125, "learning_rate": 5.692834393275226e-06, "loss": 0.005, "num_tokens": 734758409.0, "reward": 0.002197265625, "reward_std": 0.006413000635802746, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.017578125, "rewards/tag_count_reward/std": 0.06404344737529755, "step": 1733 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6784369345136205, "frac_reward_zero_std": 0.375, "grad_norm": 0.5751583518924732, "kl": 0.4609375, "learning_rate": 5.68051125277074e-06, "loss": 0.0046, "num_tokens": 735310473.0, "reward": 0.001953125, "reward_std": 0.005779958330094814, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.060633908957242966, "step": 1734 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6788281899545165, "frac_reward_zero_std": 0.5, "grad_norm": 0.5907267487131251, "kl": 0.42236328125, "learning_rate": 5.668196171177129e-06, "loss": 0.0042, "num_tokens": 735863577.0, "reward": 0.00146484375, "reward_std": 0.004621601663529873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.01171875, "rewards/tag_count_reward/std": 0.05294628441333771, "step": 1735 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6792194453954126, "frac_reward_zero_std": 0.4375, "grad_norm": 0.4827371447786795, "kl": 0.411865234375, "learning_rate": 5.655889171470759e-06, "loss": 0.0041, "num_tokens": 736416265.0, "reward": 0.0015869140625, "reward_std": 0.004958552308380604, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0126953125, "rewards/tag_count_reward/std": 0.05499519780278206, "step": 1736 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6796107008363085, "frac_reward_zero_std": 0.5625, "grad_norm": 0.42959412052175333, "kl": 0.407470703125, "learning_rate": 5.643590276612909e-06, "loss": 0.0041, "num_tokens": 736970361.0, "reward": 0.0010986328125, "reward_std": 0.0037756445817649364, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 1737 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6800019562772045, "frac_reward_zero_std": 0.5625, "grad_norm": 0.42064226265815785, "kl": 0.39013671875, "learning_rate": 5.631299509549748e-06, "loss": 0.0039, "num_tokens": 737526057.0, "reward": 0.0010986328125, "reward_std": 0.0037756445817649364, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0087890625, "rewards/tag_count_reward/std": 0.046133846044540405, "step": 1738 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6803932117181004, "frac_reward_zero_std": 0.5, "grad_norm": 0.6969162910795687, "kl": 0.382568359375, "learning_rate": 5.619016893212273e-06, "loss": 0.0038, "num_tokens": 738079305.0, "reward": 0.0009765625, "reward_std": 0.00390625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0078125, "rewards/tag_count_reward/std": 0.04358336701989174, "step": 1739 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6807844671589964, "frac_reward_zero_std": 0.6875, "grad_norm": 1.1670057763859203, "kl": 0.417236328125, "learning_rate": 5.606742450516275e-06, "loss": 0.0042, "num_tokens": 738634905.0, "reward": 0.0008544921875, "reward_std": 0.0027990820817649364, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0068359375, "rewards/tag_count_reward/std": 0.040850620716810226, "step": 1740 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6811757225998925, "frac_reward_zero_std": 0.375, "grad_norm": 0.7290769338213176, "kl": 0.39013671875, "learning_rate": 5.594476204362303e-06, "loss": 0.0039, "num_tokens": 739188425.0, "reward": 0.001953125, "reward_std": 0.005897212773561478, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.015625, "rewards/tag_count_reward/std": 0.060633908957242966, "step": 1741 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6815669780407884, "frac_reward_zero_std": 0.1875, "grad_norm": 0.6720273664710892, "kl": 0.391357421875, "learning_rate": 5.582218177635607e-06, "loss": 0.0039, "num_tokens": 739742857.0, "reward": 0.00341796875, "reward_std": 0.008493054658174515, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.02734375, "rewards/tag_count_reward/std": 0.07818012684583664, "step": 1742 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6819582334816844, "frac_reward_zero_std": 0.0625, "grad_norm": 0.689210600463414, "kl": 0.420166015625, "learning_rate": 5.5699683932061e-06, "loss": 0.0042, "num_tokens": 740294969.0, "reward": 0.007568359375, "reward_std": 0.012755883857607841, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.060546875, "rewards/tag_count_reward/std": 0.10731159895658493, "step": 1743 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6823494889225803, "frac_reward_zero_std": 0.0, "grad_norm": 0.8030300095473518, "kl": 0.461669921875, "learning_rate": 5.557726873928323e-06, "loss": 0.0046, "num_tokens": 740848569.0, "reward": 0.011962890625, "reward_std": 0.015016300603747368, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.095703125, "rewards/tag_count_reward/std": 0.12175632268190384, "step": 1744 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6827407443634763, "frac_reward_zero_std": 0.0, "grad_norm": 0.9044070668230464, "kl": 0.51171875, "learning_rate": 5.545493642641389e-06, "loss": 0.0051, "num_tokens": 741404201.0, "reward": 0.0089111328125, "reward_std": 0.013933210633695126, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0712890625, "rewards/tag_count_reward/std": 0.11524015665054321, "step": 1745 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6831319998043722, "frac_reward_zero_std": 0.0625, "grad_norm": 1.059501182083293, "kl": 0.54541015625, "learning_rate": 5.533268722168959e-06, "loss": 0.0054, "num_tokens": 741958153.0, "reward": 0.007568359375, "reward_std": 0.012747085653245449, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.060546875, "rewards/tag_count_reward/std": 0.10731159895658493, "step": 1746 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6835232552452682, "frac_reward_zero_std": 0.0, "grad_norm": 1.9839284620913207, "kl": 0.5400390625, "learning_rate": 5.521052135319182e-06, "loss": 0.0054, "num_tokens": 742511129.0, "reward": 0.0098876953125, "reward_std": 0.014368297532200813, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0791015625, "rewards/tag_count_reward/std": 0.11649612337350845, "step": 1747 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6839145106861643, "frac_reward_zero_std": 0.0, "grad_norm": 0.8031940481628045, "kl": 0.5068359375, "learning_rate": 5.508843904884652e-06, "loss": 0.0051, "num_tokens": 743062489.0, "reward": 0.01220703125, "reward_std": 0.015215273946523666, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.09765625, "rewards/tag_count_reward/std": 0.12420088052749634, "step": 1748 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6843057661270602, "frac_reward_zero_std": 0.0, "grad_norm": 0.9660719699758054, "kl": 0.4892578125, "learning_rate": 5.496644053642394e-06, "loss": 0.0049, "num_tokens": 743614329.0, "reward": 0.01220703125, "reward_std": 0.015136253088712692, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.09765625, "rewards/tag_count_reward/std": 0.12420088052749634, "step": 1749 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6846970215679562, "frac_reward_zero_std": 0.0, "grad_norm": 6.194746331322342, "kl": 0.4453125, "learning_rate": 5.484452604353772e-06, "loss": 0.0044, "num_tokens": 744166825.0, "reward": 0.0089111328125, "reward_std": 0.013925764709711075, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0712890625, "rewards/tag_count_reward/std": 0.1130933091044426, "step": 1750 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6850882770088521, "frac_reward_zero_std": 0.0, "grad_norm": 0.5895548799967114, "kl": 0.435546875, "learning_rate": 5.472269579764486e-06, "loss": 0.0044, "num_tokens": 744720937.0, "reward": 0.006591796875, "reward_std": 0.012839436531066895, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.052734375, "rewards/tag_count_reward/std": 0.10219332575798035, "step": 1751 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6854795324497481, "frac_reward_zero_std": 0.0, "grad_norm": 0.7582215922744177, "kl": 0.436279296875, "learning_rate": 5.460095002604533e-06, "loss": 0.0044, "num_tokens": 745273081.0, "reward": 0.00830078125, "reward_std": 0.013743184506893158, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.06640625, "rewards/tag_count_reward/std": 0.11063265055418015, "step": 1752 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6858707878906442, "frac_reward_zero_std": 0.0, "grad_norm": 1.0744320580081412, "kl": 0.412841796875, "learning_rate": 5.447928895588128e-06, "loss": 0.0041, "num_tokens": 745829785.0, "reward": 0.0093994140625, "reward_std": 0.014487044885754585, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0751953125, "rewards/tag_count_reward/std": 0.11698818951845169, "step": 1753 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6862620433315401, "frac_reward_zero_std": 0.0, "grad_norm": 0.6151567704217022, "kl": 0.416748046875, "learning_rate": 5.4357712814136934e-06, "loss": 0.0042, "num_tokens": 746384489.0, "reward": 0.0101318359375, "reward_std": 0.014513584785163403, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0810546875, "rewards/tag_count_reward/std": 0.11932186782360077, "step": 1754 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6866532987724361, "frac_reward_zero_std": 0.0, "grad_norm": 0.5338077680215082, "kl": 0.431640625, "learning_rate": 5.423622182763805e-06, "loss": 0.0043, "num_tokens": 746937353.0, "reward": 0.0111083984375, "reward_std": 0.01490692887455225, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0888671875, "rewards/tag_count_reward/std": 0.11989818513393402, "step": 1755 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.687044554213332, "frac_reward_zero_std": 0.0, "grad_norm": 0.5151980899388467, "kl": 0.4443359375, "learning_rate": 5.411481622305145e-06, "loss": 0.0044, "num_tokens": 747490393.0, "reward": 0.010498046875, "reward_std": 0.014821207150816917, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.083984375, "rewards/tag_count_reward/std": 0.1183105930685997, "step": 1756 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.687435809654228, "frac_reward_zero_std": 0.0, "grad_norm": 0.6411929524925547, "kl": 0.4599609375, "learning_rate": 5.399349622688479e-06, "loss": 0.0046, "num_tokens": 748043913.0, "reward": 0.0115966796875, "reward_std": 0.01503140851855278, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0927734375, "rewards/tag_count_reward/std": 0.12301970273256302, "step": 1757 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6878270650951239, "frac_reward_zero_std": 0.0, "grad_norm": 0.6147243485340492, "kl": 0.4775390625, "learning_rate": 5.387226206548592e-06, "loss": 0.0048, "num_tokens": 748595641.0, "reward": 0.0130615234375, "reward_std": 0.015611660666763783, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1044921875, "rewards/tag_count_reward/std": 0.12551595270633698, "step": 1758 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.68821832053602, "frac_reward_zero_std": 0.0, "grad_norm": 0.7005526691770428, "kl": 0.457763671875, "learning_rate": 5.37511139650425e-06, "loss": 0.0046, "num_tokens": 749151177.0, "reward": 0.0128173828125, "reward_std": 0.016169600188732147, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1025390625, "rewards/tag_count_reward/std": 0.12903639674186707, "step": 1759 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.688609575976916, "frac_reward_zero_std": 0.0, "grad_norm": 0.6615398920382781, "kl": 0.4580078125, "learning_rate": 5.36300521515818e-06, "loss": 0.0046, "num_tokens": 749708521.0, "reward": 0.0108642578125, "reward_std": 0.01515212282538414, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0869140625, "rewards/tag_count_reward/std": 0.12333061546087265, "step": 1760 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6890008314178119, "frac_reward_zero_std": 0.0, "grad_norm": 0.6135937285223642, "kl": 0.47314453125, "learning_rate": 5.350907685096983e-06, "loss": 0.0047, "num_tokens": 750263913.0, "reward": 0.0107421875, "reward_std": 0.015371473506093025, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0859375, "rewards/tag_count_reward/std": 0.125, "step": 1761 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6893920868587079, "frac_reward_zero_std": 0.0, "grad_norm": 0.5271994809612881, "kl": 0.47509765625, "learning_rate": 5.338818828891148e-06, "loss": 0.0048, "num_tokens": 750818809.0, "reward": 0.01025390625, "reward_std": 0.014661481603980064, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.08203125, "rewards/tag_count_reward/std": 0.12370654940605164, "step": 1762 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6897833422996038, "frac_reward_zero_std": 0.0, "grad_norm": 0.5552852540930762, "kl": 0.479248046875, "learning_rate": 5.3267386690949614e-06, "loss": 0.0048, "num_tokens": 751374121.0, "reward": 0.009765625, "reward_std": 0.014929408207535744, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.078125, "rewards/tag_count_reward/std": 0.12025300413370132, "step": 1763 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6901745977404998, "frac_reward_zero_std": 0.0, "grad_norm": 0.6218940645142025, "kl": 0.50244140625, "learning_rate": 5.314667228246489e-06, "loss": 0.005, "num_tokens": 751927433.0, "reward": 0.0111083984375, "reward_std": 0.015614369884133339, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0888671875, "rewards/tag_count_reward/std": 0.12588153779506683, "step": 1764 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6905658531813959, "frac_reward_zero_std": 0.0, "grad_norm": 0.6001372907541404, "kl": 0.51416015625, "learning_rate": 5.302604528867544e-06, "loss": 0.0051, "num_tokens": 752480041.0, "reward": 0.0128173828125, "reward_std": 0.015879416838288307, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1025390625, "rewards/tag_count_reward/std": 0.12712275981903076, "step": 1765 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6909571086222918, "frac_reward_zero_std": 0.0, "grad_norm": 0.6567663545593888, "kl": 0.51904296875, "learning_rate": 5.290550593463606e-06, "loss": 0.0052, "num_tokens": 753032905.0, "reward": 0.0130615234375, "reward_std": 0.015648551285266876, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1044921875, "rewards/tag_count_reward/std": 0.12551595270633698, "step": 1766 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6913483640631878, "frac_reward_zero_std": 0.0, "grad_norm": 0.5871544628548031, "kl": 0.50244140625, "learning_rate": 5.2785054445238156e-06, "loss": 0.005, "num_tokens": 753586793.0, "reward": 0.0123291015625, "reward_std": 0.01656537503004074, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0986328125, "rewards/tag_count_reward/std": 0.13205794990062714, "step": 1767 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6917396195040837, "frac_reward_zero_std": 0.0, "grad_norm": 0.7112015846337126, "kl": 0.49560546875, "learning_rate": 5.266469104520928e-06, "loss": 0.005, "num_tokens": 754141241.0, "reward": 0.0142822265625, "reward_std": 0.017948569729924202, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1142578125, "rewards/tag_count_reward/std": 0.14308175444602966, "step": 1768 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6921308749449797, "frac_reward_zero_std": 0.0, "grad_norm": 0.763929220242604, "kl": 0.489013671875, "learning_rate": 5.254441595911255e-06, "loss": 0.0049, "num_tokens": 754694985.0, "reward": 0.0125732421875, "reward_std": 0.01667136326432228, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1005859375, "rewards/tag_count_reward/std": 0.13427236676216125, "step": 1769 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6925221303858757, "frac_reward_zero_std": 0.0, "grad_norm": 0.5996710361732903, "kl": 0.476806640625, "learning_rate": 5.242422941134626e-06, "loss": 0.0048, "num_tokens": 755249561.0, "reward": 0.0133056640625, "reward_std": 0.016364119946956635, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1064453125, "rewards/tag_count_reward/std": 0.12965814769268036, "step": 1770 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6929133858267716, "frac_reward_zero_std": 0.0, "grad_norm": 0.6142420899322919, "kl": 0.4794921875, "learning_rate": 5.230413162614371e-06, "loss": 0.0048, "num_tokens": 755805017.0, "reward": 0.0086669921875, "reward_std": 0.014157634228467941, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0693359375, "rewards/tag_count_reward/std": 0.11430587619543076, "step": 1771 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6933046412676677, "frac_reward_zero_std": 0.0, "grad_norm": 0.6180204425755905, "kl": 0.49560546875, "learning_rate": 5.2184122827572315e-06, "loss": 0.005, "num_tokens": 756360873.0, "reward": 0.00927734375, "reward_std": 0.01485484093427658, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.07421875, "rewards/tag_count_reward/std": 0.12069803476333618, "step": 1772 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6936958967085636, "frac_reward_zero_std": 0.0, "grad_norm": 5.0862474438816045, "kl": 0.492431640625, "learning_rate": 5.206420323953374e-06, "loss": 0.0049, "num_tokens": 756913273.0, "reward": 0.01220703125, "reward_std": 0.016434166580438614, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.09765625, "rewards/tag_count_reward/std": 0.13552503287792206, "step": 1773 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6940871521494596, "frac_reward_zero_std": 0.0, "grad_norm": 0.6445881221104038, "kl": 0.497802734375, "learning_rate": 5.194437308576306e-06, "loss": 0.005, "num_tokens": 757468441.0, "reward": 0.0115966796875, "reward_std": 0.016086257994174957, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0927734375, "rewards/tag_count_reward/std": 0.12885820865631104, "step": 1774 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6944784075903555, "frac_reward_zero_std": 0.0, "grad_norm": 1.2411617441714278, "kl": 0.53271484375, "learning_rate": 5.1824632589828465e-06, "loss": 0.0053, "num_tokens": 758021673.0, "reward": 0.0123291015625, "reward_std": 0.016391003504395485, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.0986328125, "rewards/tag_count_reward/std": 0.13390107452869415, "step": 1775 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6948696630312515, "frac_reward_zero_std": 0.0, "grad_norm": 1.140809952578659, "kl": 0.52978515625, "learning_rate": 5.1704981975131e-06, "loss": 0.0053, "num_tokens": 758575401.0, "reward": 0.0133056640625, "reward_std": 0.017048444598913193, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1064453125, "rewards/tag_count_reward/std": 0.14054328203201294, "step": 1776 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6952609184721475, "frac_reward_zero_std": 0.0, "grad_norm": 1.7434767679173329, "kl": 0.53857421875, "learning_rate": 5.1585421464904e-06, "loss": 0.0054, "num_tokens": 759130761.0, "reward": 0.017822265625, "reward_std": 0.019538741558790207, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.142578125, "rewards/tag_count_reward/std": 0.1602191925048828, "step": 1777 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6956521739130435, "frac_reward_zero_std": 0.0, "grad_norm": 0.835685371420558, "kl": 0.54736328125, "learning_rate": 5.146595128221246e-06, "loss": 0.0055, "num_tokens": 759682489.0, "reward": 0.012451171875, "reward_std": 0.01665002852678299, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.099609375, "rewards/tag_count_reward/std": 0.13590598106384277, "step": 1778 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6960434293539395, "frac_reward_zero_std": 0.0, "grad_norm": 0.6234607551884979, "kl": 0.5234375, "learning_rate": 5.134657164995315e-06, "loss": 0.0052, "num_tokens": 760237241.0, "reward": 0.0164794921875, "reward_std": 0.01938299834728241, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1318359375, "rewards/tag_count_reward/std": 0.15640617907047272, "step": 1779 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6964346847948354, "frac_reward_zero_std": 0.0, "grad_norm": 0.6205723198125577, "kl": 0.52978515625, "learning_rate": 5.122728279085376e-06, "loss": 0.0053, "num_tokens": 760790745.0, "reward": 0.0152587890625, "reward_std": 0.018431197851896286, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1220703125, "rewards/tag_count_reward/std": 0.1517583578824997, "step": 1780 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6968259402357314, "frac_reward_zero_std": 0.0, "grad_norm": 0.5897820884315331, "kl": 0.5869140625, "learning_rate": 5.110808492747258e-06, "loss": 0.0059, "num_tokens": 761346505.0, "reward": 0.018310546875, "reward_std": 0.018208783119916916, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.146484375, "rewards/tag_count_reward/std": 0.1486123651266098, "step": 1781 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6972171956766274, "frac_reward_zero_std": 0.0, "grad_norm": 0.6058956233892691, "kl": 0.5244140625, "learning_rate": 5.098897828219831e-06, "loss": 0.0053, "num_tokens": 761899769.0, "reward": 0.0157470703125, "reward_std": 0.018664289265871048, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1259765625, "rewards/tag_count_reward/std": 0.15497952699661255, "step": 1782 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6976084511175233, "frac_reward_zero_std": 0.0, "grad_norm": 0.662947697678598, "kl": 0.52587890625, "learning_rate": 5.0869963077249204e-06, "loss": 0.0053, "num_tokens": 762452777.0, "reward": 0.018798828125, "reward_std": 0.019473323598504066, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.150390625, "rewards/tag_count_reward/std": 0.15761657059192657, "step": 1783 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6979997065584194, "frac_reward_zero_std": 0.0, "grad_norm": 0.5967014061211123, "kl": 0.52880859375, "learning_rate": 5.075103953467314e-06, "loss": 0.0053, "num_tokens": 763006889.0, "reward": 0.012939453125, "reward_std": 0.01744392327964306, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.103515625, "rewards/tag_count_reward/std": 0.1435794085264206, "step": 1784 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6983909619993153, "frac_reward_zero_std": 0.0, "grad_norm": 0.6367079658426851, "kl": 0.54296875, "learning_rate": 5.063220787634686e-06, "loss": 0.0054, "num_tokens": 763560585.0, "reward": 0.0133056640625, "reward_std": 0.01788846217095852, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1064453125, "rewards/tag_count_reward/std": 0.14735399186611176, "step": 1785 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6987822174402113, "frac_reward_zero_std": 0.0, "grad_norm": 2.9290488468183504, "kl": 0.60791015625, "learning_rate": 5.051346832397569e-06, "loss": 0.0061, "num_tokens": 764116585.0, "reward": 0.0164794921875, "reward_std": 0.020289724692702293, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1318359375, "rewards/tag_count_reward/std": 0.1640544831752777, "step": 1786 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6991734728811072, "frac_reward_zero_std": 0.0, "grad_norm": 0.9467751838972496, "kl": 0.61572265625, "learning_rate": 5.0394821099093195e-06, "loss": 0.0062, "num_tokens": 764671881.0, "reward": 0.01806640625, "reward_std": 0.019356844946742058, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.14453125, "rewards/tag_count_reward/std": 0.155328169465065, "step": 1787 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6995647283220032, "frac_reward_zero_std": 0.0, "grad_norm": 0.8001575319821481, "kl": 0.6513671875, "learning_rate": 5.027626642306057e-06, "loss": 0.0065, "num_tokens": 765225465.0, "reward": 0.02197265625, "reward_std": 0.02082572877407074, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.1696576029062271, "step": 1788 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.6999559837628992, "frac_reward_zero_std": 0.0, "grad_norm": 1.1755628957119941, "kl": 0.68408203125, "learning_rate": 5.015780451706641e-06, "loss": 0.0068, "num_tokens": 765780953.0, "reward": 0.022216796875, "reward_std": 0.02125539816915989, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.17193348705768585, "step": 1789 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7003472392037952, "frac_reward_zero_std": 0.0, "grad_norm": 0.6857502006128733, "kl": 0.73486328125, "learning_rate": 5.003943560212621e-06, "loss": 0.0073, "num_tokens": 766335321.0, "reward": 0.0216064453125, "reward_std": 0.020116500556468964, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.16614213585853577, "step": 1790 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7007384946446912, "frac_reward_zero_std": 0.0, "grad_norm": 4.766349159801653, "kl": 0.73681640625, "learning_rate": 4.992115989908192e-06, "loss": 0.0074, "num_tokens": 766889289.0, "reward": 0.021728515625, "reward_std": 0.02069772779941559, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.16878631711006165, "step": 1791 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7011297500855871, "frac_reward_zero_std": 0.0, "grad_norm": 1.541854789543874, "kl": 0.74609375, "learning_rate": 4.980297762860171e-06, "loss": 0.0075, "num_tokens": 767442841.0, "reward": 0.0191650390625, "reward_std": 0.019564202055335045, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1533203125, "rewards/tag_count_reward/std": 0.1586669683456421, "step": 1792 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7015210055264831, "frac_reward_zero_std": 0.0, "grad_norm": 1.3256270912009585, "kl": 0.7509765625, "learning_rate": 4.9684889011179335e-06, "loss": 0.0075, "num_tokens": 767997625.0, "reward": 0.0223388671875, "reward_std": 0.020947953686118126, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1787109375, "rewards/tag_count_reward/std": 0.16874943673610687, "step": 1793 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7019122609673791, "frac_reward_zero_std": 0.0, "grad_norm": 1.2665972271401473, "kl": 0.7236328125, "learning_rate": 4.956689426713384e-06, "loss": 0.0072, "num_tokens": 768550505.0, "reward": 0.020263671875, "reward_std": 0.019533459097146988, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.162109375, "rewards/tag_count_reward/std": 0.15683713555335999, "step": 1794 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.702303516408275, "frac_reward_zero_std": 0.0, "grad_norm": 0.8075471680640696, "kl": 0.6669921875, "learning_rate": 4.94489936166091e-06, "loss": 0.0067, "num_tokens": 769105049.0, "reward": 0.02099609375, "reward_std": 0.02001585066318512, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16796875, "rewards/tag_count_reward/std": 0.1660066694021225, "step": 1795 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7026947718491711, "frac_reward_zero_std": 0.0, "grad_norm": 0.8185602567344133, "kl": 0.61376953125, "learning_rate": 4.93311872795735e-06, "loss": 0.0061, "num_tokens": 769658281.0, "reward": 0.0216064453125, "reward_std": 0.020645875483751297, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.16761088371276855, "step": 1796 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.703086027290067, "frac_reward_zero_std": 0.0, "grad_norm": 0.9934709452112971, "kl": 0.56005859375, "learning_rate": 4.921347547581939e-06, "loss": 0.0056, "num_tokens": 770211689.0, "reward": 0.0218505859375, "reward_std": 0.021011479198932648, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.16849961876869202, "step": 1797 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.703477282730963, "frac_reward_zero_std": 0.0, "grad_norm": 1.8907868737066706, "kl": 0.488525390625, "learning_rate": 4.909585842496287e-06, "loss": 0.0049, "num_tokens": 770766329.0, "reward": 0.022216796875, "reward_std": 0.02014349028468132, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.163156196475029, "step": 1798 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7038685381718589, "frac_reward_zero_std": 0.0, "grad_norm": 0.6679647610392255, "kl": 0.455810546875, "learning_rate": 4.897833634644313e-06, "loss": 0.0046, "num_tokens": 771320233.0, "reward": 0.02001953125, "reward_std": 0.019360551610589027, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16015625, "rewards/tag_count_reward/std": 0.15728822350502014, "step": 1799 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7042597936127549, "frac_reward_zero_std": 0.0, "grad_norm": 0.5613971989052382, "kl": 0.43359375, "learning_rate": 4.8860909459522245e-06, "loss": 0.0043, "num_tokens": 771873801.0, "reward": 0.01953125, "reward_std": 0.020866407081484795, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.1671561449766159, "step": 1800 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.704651049053651, "frac_reward_zero_std": 0.0, "grad_norm": 1.5527705933435143, "kl": 0.40087890625, "learning_rate": 4.874357798328464e-06, "loss": 0.004, "num_tokens": 772427081.0, "reward": 0.022216796875, "reward_std": 0.02180694043636322, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.17754410207271576, "step": 1801 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7050423044945469, "frac_reward_zero_std": 0.0, "grad_norm": 0.600801512565828, "kl": 0.382080078125, "learning_rate": 4.862634213663672e-06, "loss": 0.0038, "num_tokens": 772981417.0, "reward": 0.021240234375, "reward_std": 0.019869018346071243, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.169921875, "rewards/tag_count_reward/std": 0.15945248305797577, "step": 1802 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7054335599354429, "frac_reward_zero_std": 0.0, "grad_norm": 0.8284861506597977, "kl": 0.37646484375, "learning_rate": 4.850920213830659e-06, "loss": 0.0038, "num_tokens": 773538025.0, "reward": 0.01904296875, "reward_std": 0.019823549315333366, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15234375, "rewards/tag_count_reward/std": 0.15883885324001312, "step": 1803 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7058248153763388, "frac_reward_zero_std": 0.0, "grad_norm": 0.6024375362270181, "kl": 0.38037109375, "learning_rate": 4.839215820684342e-06, "loss": 0.0038, "num_tokens": 774093193.0, "reward": 0.020263671875, "reward_std": 0.020360250025987625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.162109375, "rewards/tag_count_reward/std": 0.16446539759635925, "step": 1804 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7062160708172348, "frac_reward_zero_std": 0.0, "grad_norm": 1.9960527640107188, "kl": 0.398681640625, "learning_rate": 4.827521056061717e-06, "loss": 0.004, "num_tokens": 774646825.0, "reward": 0.017822265625, "reward_std": 0.020003922283649445, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.142578125, "rewards/tag_count_reward/std": 0.16325005888938904, "step": 1805 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7066073262581308, "frac_reward_zero_std": 0.0, "grad_norm": 0.824424407775734, "kl": 0.403564453125, "learning_rate": 4.815835941781816e-06, "loss": 0.004, "num_tokens": 775199593.0, "reward": 0.0213623046875, "reward_std": 0.020721320062875748, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1708984375, "rewards/tag_count_reward/std": 0.1652175486087799, "step": 1806 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7069985816990267, "frac_reward_zero_std": 0.0, "grad_norm": 0.5566782110361209, "kl": 0.427978515625, "learning_rate": 4.804160499645667e-06, "loss": 0.0043, "num_tokens": 775752649.0, "reward": 0.0220947265625, "reward_std": 0.021993447095155716, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.1750541627407074, "step": 1807 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7073898371399228, "frac_reward_zero_std": 0.0, "grad_norm": 0.873866048489384, "kl": 0.435302734375, "learning_rate": 4.7924947514362495e-06, "loss": 0.0044, "num_tokens": 776306105.0, "reward": 0.022216796875, "reward_std": 0.020202476531267166, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.163156196475029, "step": 1808 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7077810925808187, "frac_reward_zero_std": 0.0, "grad_norm": 0.7342474963689908, "kl": 0.45263671875, "learning_rate": 4.780838718918467e-06, "loss": 0.0045, "num_tokens": 776859929.0, "reward": 0.0218505859375, "reward_std": 0.01955953985452652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.16106253862380981, "step": 1809 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7081723480217147, "frac_reward_zero_std": 0.0, "grad_norm": 1.3480515318078512, "kl": 0.46240234375, "learning_rate": 4.769192423839085e-06, "loss": 0.0046, "num_tokens": 777412953.0, "reward": 0.0234375, "reward_std": 0.01944587752223015, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1875, "rewards/tag_count_reward/std": 0.16269785165786743, "step": 1810 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7085636034626106, "frac_reward_zero_std": 0.0, "grad_norm": 0.7752430579213162, "kl": 0.476318359375, "learning_rate": 4.757555887926708e-06, "loss": 0.0048, "num_tokens": 777968569.0, "reward": 0.0194091796875, "reward_std": 0.02023458480834961, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1552734375, "rewards/tag_count_reward/std": 0.1643809676170349, "step": 1811 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7089548589035066, "frac_reward_zero_std": 0.0, "grad_norm": 0.7449195104029073, "kl": 0.47998046875, "learning_rate": 4.7459291328917275e-06, "loss": 0.0048, "num_tokens": 778523561.0, "reward": 0.020751953125, "reward_std": 0.021096836775541306, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.166015625, "rewards/tag_count_reward/std": 0.17086099088191986, "step": 1812 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7093461143444026, "frac_reward_zero_std": 0.0, "grad_norm": 0.8385945186470491, "kl": 0.489501953125, "learning_rate": 4.734312180426289e-06, "loss": 0.0049, "num_tokens": 779077257.0, "reward": 0.02197265625, "reward_std": 0.020997559651732445, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.17252273857593536, "step": 1813 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7097373697852986, "frac_reward_zero_std": 0.0, "grad_norm": 0.6296371495267139, "kl": 0.49365234375, "learning_rate": 4.722705052204256e-06, "loss": 0.0049, "num_tokens": 779631369.0, "reward": 0.0218505859375, "reward_std": 0.02057839184999466, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.16849961876869202, "step": 1814 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7101286252261946, "frac_reward_zero_std": 0.0, "grad_norm": 0.6066851147397134, "kl": 0.491455078125, "learning_rate": 4.711107769881153e-06, "loss": 0.0049, "num_tokens": 780184105.0, "reward": 0.020263671875, "reward_std": 0.02092280052602291, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.162109375, "rewards/tag_count_reward/std": 0.16887705028057098, "step": 1815 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7105198806670905, "frac_reward_zero_std": 0.0, "grad_norm": 0.5725415378924618, "kl": 0.470947265625, "learning_rate": 4.6995203550941425e-06, "loss": 0.0047, "num_tokens": 780737737.0, "reward": 0.0206298828125, "reward_std": 0.020441807806491852, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1650390625, "rewards/tag_count_reward/std": 0.16377411782741547, "step": 1816 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7109111361079865, "frac_reward_zero_std": 0.0, "grad_norm": 0.502997290288563, "kl": 0.4501953125, "learning_rate": 4.687942829461969e-06, "loss": 0.0045, "num_tokens": 781290985.0, "reward": 0.018310546875, "reward_std": 0.020037854090332985, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.146484375, "rewards/tag_count_reward/std": 0.15974043309688568, "step": 1817 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7113023915488825, "frac_reward_zero_std": 0.0, "grad_norm": 0.6670981623998415, "kl": 0.44140625, "learning_rate": 4.6763752145849295e-06, "loss": 0.0044, "num_tokens": 781845721.0, "reward": 0.0179443359375, "reward_std": 0.02019880712032318, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1435546875, "rewards/tag_count_reward/std": 0.16163218021392822, "step": 1818 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7116936469897784, "frac_reward_zero_std": 0.0, "grad_norm": 0.6755470558824049, "kl": 0.45947265625, "learning_rate": 4.66481753204484e-06, "loss": 0.0046, "num_tokens": 782401641.0, "reward": 0.01953125, "reward_std": 0.019288118928670883, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.15655608475208282, "step": 1819 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7120849024306745, "frac_reward_zero_std": 0.0, "grad_norm": 0.6373729461656026, "kl": 0.452392578125, "learning_rate": 4.653269803404973e-06, "loss": 0.0045, "num_tokens": 782955065.0, "reward": 0.0213623046875, "reward_std": 0.02033325657248497, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1708984375, "rewards/tag_count_reward/std": 0.1652175486087799, "step": 1820 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7124761578715704, "frac_reward_zero_std": 0.0, "grad_norm": 0.5238258880242257, "kl": 0.459716796875, "learning_rate": 4.641732050210032e-06, "loss": 0.0046, "num_tokens": 783508137.0, "reward": 0.022216796875, "reward_std": 0.020874392241239548, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.16905836760997772, "step": 1821 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7128674133124664, "frac_reward_zero_std": 0.0, "grad_norm": 0.6247993993739743, "kl": 0.46484375, "learning_rate": 4.630204293986122e-06, "loss": 0.0046, "num_tokens": 784062505.0, "reward": 0.021728515625, "reward_std": 0.019777875393629074, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.1613624393939972, "step": 1822 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7132586687533623, "frac_reward_zero_std": 0.0, "grad_norm": 0.5952971165940238, "kl": 0.478515625, "learning_rate": 4.618686556240675e-06, "loss": 0.0048, "num_tokens": 784613993.0, "reward": 0.0196533203125, "reward_std": 0.020481102168560028, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1572265625, "rewards/tag_count_reward/std": 0.16549547016620636, "step": 1823 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7136499241942583, "frac_reward_zero_std": 0.0, "grad_norm": 0.6171544159967504, "kl": 0.4794921875, "learning_rate": 4.607178858462445e-06, "loss": 0.0048, "num_tokens": 785168185.0, "reward": 0.0205078125, "reward_std": 0.020759668201208115, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1640625, "rewards/tag_count_reward/std": 0.16549836099147797, "step": 1824 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7140411796351543, "frac_reward_zero_std": 0.0, "grad_norm": 0.502678029453518, "kl": 0.45751953125, "learning_rate": 4.595681222121458e-06, "loss": 0.0046, "num_tokens": 785722505.0, "reward": 0.0206298828125, "reward_std": 0.020199930295348167, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1650390625, "rewards/tag_count_reward/std": 0.16820389032363892, "step": 1825 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7144324350760503, "frac_reward_zero_std": 0.0, "grad_norm": 0.5786621297506648, "kl": 0.467041015625, "learning_rate": 4.58419366866896e-06, "loss": 0.0047, "num_tokens": 786276985.0, "reward": 0.01953125, "reward_std": 0.020653298124670982, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.1641974002122879, "step": 1826 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7148236905169463, "frac_reward_zero_std": 0.0, "grad_norm": 0.7470951249281228, "kl": 0.44287109375, "learning_rate": 4.572716219537386e-06, "loss": 0.0044, "num_tokens": 786834777.0, "reward": 0.0220947265625, "reward_std": 0.020685534924268723, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.16790765523910522, "step": 1827 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7152149459578422, "frac_reward_zero_std": 0.0, "grad_norm": 0.4728682726132651, "kl": 0.43603515625, "learning_rate": 4.561248896140321e-06, "loss": 0.0044, "num_tokens": 787388841.0, "reward": 0.021484375, "reward_std": 0.01976374164223671, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.16344934701919556, "step": 1828 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7156062013987382, "frac_reward_zero_std": 0.0, "grad_norm": 0.5204311977038222, "kl": 0.4404296875, "learning_rate": 4.549791719872458e-06, "loss": 0.0044, "num_tokens": 787943305.0, "reward": 0.0223388671875, "reward_std": 0.02196553722023964, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1787109375, "rewards/tag_count_reward/std": 0.1786273866891861, "step": 1829 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7159974568396342, "frac_reward_zero_std": 0.0, "grad_norm": 0.5043861555113102, "kl": 0.43603515625, "learning_rate": 4.538344712109562e-06, "loss": 0.0044, "num_tokens": 788495625.0, "reward": 0.0203857421875, "reward_std": 0.02126336842775345, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1630859375, "rewards/tag_count_reward/std": 0.17010565102100372, "step": 1830 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7163887122805301, "frac_reward_zero_std": 0.0, "grad_norm": 0.510975724744681, "kl": 0.423828125, "learning_rate": 4.526907894208421e-06, "loss": 0.0042, "num_tokens": 789051961.0, "reward": 0.0206298828125, "reward_std": 0.020500754937529564, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1650390625, "rewards/tag_count_reward/std": 0.16674037277698517, "step": 1831 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7167799677214262, "frac_reward_zero_std": 0.0, "grad_norm": 0.5550707731551937, "kl": 0.404541015625, "learning_rate": 4.515481287506811e-06, "loss": 0.004, "num_tokens": 789605417.0, "reward": 0.0225830078125, "reward_std": 0.022243255749344826, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1806640625, "rewards/tag_count_reward/std": 0.17802608013153076, "step": 1832 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7171712231623221, "frac_reward_zero_std": 0.0, "grad_norm": 0.5745922984209817, "kl": 0.398681640625, "learning_rate": 4.504064913323472e-06, "loss": 0.004, "num_tokens": 790159353.0, "reward": 0.021728515625, "reward_std": 0.0207536444067955, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.16437223553657532, "step": 1833 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7175624786032181, "frac_reward_zero_std": 0.0, "grad_norm": 0.6395252516006632, "kl": 0.40673828125, "learning_rate": 4.492658792958027e-06, "loss": 0.0041, "num_tokens": 790711145.0, "reward": 0.0216064453125, "reward_std": 0.020641759037971497, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.16761088371276855, "step": 1834 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.717953734044114, "frac_reward_zero_std": 0.0, "grad_norm": 0.4795577163385755, "kl": 0.402587890625, "learning_rate": 4.481262947690983e-06, "loss": 0.004, "num_tokens": 791264617.0, "reward": 0.0235595703125, "reward_std": 0.021252963691949844, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1884765625, "rewards/tag_count_reward/std": 0.16969992220401764, "step": 1835 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.71834498948501, "frac_reward_zero_std": 0.0, "grad_norm": 0.5123371008435987, "kl": 0.40185546875, "learning_rate": 4.469877398783686e-06, "loss": 0.004, "num_tokens": 791817561.0, "reward": 0.0220947265625, "reward_std": 0.021705960854887962, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.17223113775253296, "step": 1836 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.718736244925906, "frac_reward_zero_std": 0.0, "grad_norm": 0.5707776894353727, "kl": 0.42138671875, "learning_rate": 4.458502167478254e-06, "loss": 0.0042, "num_tokens": 792371449.0, "reward": 0.0220947265625, "reward_std": 0.020093224942684174, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.1634698510169983, "step": 1837 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.719127500366802, "frac_reward_zero_std": 0.0, "grad_norm": 0.5426696200777732, "kl": 0.42529296875, "learning_rate": 4.447137274997563e-06, "loss": 0.0043, "num_tokens": 792924761.0, "reward": 0.0223388671875, "reward_std": 0.020640097558498383, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1787109375, "rewards/tag_count_reward/std": 0.16581912338733673, "step": 1838 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.719518755807698, "frac_reward_zero_std": 0.0, "grad_norm": 0.7491403158725404, "kl": 0.4423828125, "learning_rate": 4.435782742545209e-06, "loss": 0.0044, "num_tokens": 793479625.0, "reward": 0.0208740234375, "reward_std": 0.02009897492825985, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1669921875, "rewards/tag_count_reward/std": 0.16025206446647644, "step": 1839 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7199100112485939, "frac_reward_zero_std": 0.0, "grad_norm": 0.6792408720795656, "kl": 0.461181640625, "learning_rate": 4.424438591305438e-06, "loss": 0.0046, "num_tokens": 794031689.0, "reward": 0.0225830078125, "reward_std": 0.019940240308642387, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1806640625, "rewards/tag_count_reward/std": 0.1621762216091156, "step": 1840 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7203012666894899, "frac_reward_zero_std": 0.0, "grad_norm": 0.7798645030689713, "kl": 0.458251953125, "learning_rate": 4.413104842443149e-06, "loss": 0.0046, "num_tokens": 794584137.0, "reward": 0.0218505859375, "reward_std": 0.021747581660747528, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.17422084510326385, "step": 1841 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7206925221303859, "frac_reward_zero_std": 0.0, "grad_norm": 0.6010472577240394, "kl": 0.467041015625, "learning_rate": 4.401781517103819e-06, "loss": 0.0047, "num_tokens": 795141817.0, "reward": 0.02294921875, "reward_std": 0.02229209616780281, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.18359375, "rewards/tag_count_reward/std": 0.17982664704322815, "step": 1842 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7210837775712818, "frac_reward_zero_std": 0.0, "grad_norm": 0.6553582146138608, "kl": 0.46240234375, "learning_rate": 4.390468636413483e-06, "loss": 0.0046, "num_tokens": 795695113.0, "reward": 0.021728515625, "reward_std": 0.02036552131175995, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.16437223553657532, "step": 1843 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7214750330121779, "frac_reward_zero_std": 0.0, "grad_norm": 0.6399757584787502, "kl": 0.47021484375, "learning_rate": 4.379166221478697e-06, "loss": 0.0047, "num_tokens": 796249529.0, "reward": 0.02392578125, "reward_std": 0.021434517577290535, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.19140625, "rewards/tag_count_reward/std": 0.1742895245552063, "step": 1844 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7218662884530738, "frac_reward_zero_std": 0.0, "grad_norm": 0.563958009533358, "kl": 0.46630859375, "learning_rate": 4.367874293386471e-06, "loss": 0.0047, "num_tokens": 796803033.0, "reward": 0.023193359375, "reward_std": 0.02011454850435257, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.185546875, "rewards/tag_count_reward/std": 0.16193103790283203, "step": 1845 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7222575438939698, "frac_reward_zero_std": 0.0, "grad_norm": 0.7560412514438407, "kl": 0.467041015625, "learning_rate": 4.356592873204269e-06, "loss": 0.0047, "num_tokens": 797356617.0, "reward": 0.0205078125, "reward_std": 0.02070901170372963, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1640625, "rewards/tag_count_reward/std": 0.16697275638580322, "step": 1846 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7226487993348657, "frac_reward_zero_std": 0.0, "grad_norm": 0.6325063992095891, "kl": 0.45751953125, "learning_rate": 4.345321981979942e-06, "loss": 0.0046, "num_tokens": 797911369.0, "reward": 0.0218505859375, "reward_std": 0.020705994218587875, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.167038694024086, "step": 1847 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7230400547757617, "frac_reward_zero_std": 0.0, "grad_norm": 0.6394106468654341, "kl": 0.46240234375, "learning_rate": 4.3340616407416946e-06, "loss": 0.0046, "num_tokens": 798465625.0, "reward": 0.0224609375, "reward_std": 0.02130301296710968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.16988317668437958, "step": 1848 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7234313102166577, "frac_reward_zero_std": 0.0, "grad_norm": 0.7795921454704451, "kl": 0.468505859375, "learning_rate": 4.322811870498058e-06, "loss": 0.0047, "num_tokens": 799020073.0, "reward": 0.0238037109375, "reward_std": 0.022229855880141258, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1904296875, "rewards/tag_count_reward/std": 0.18018566071987152, "step": 1849 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1578.0, "completions/mean_length": 2046.1640625, "completions/mean_terminated_length": 1578.0, "completions/min_length": 1578.0, "completions/min_terminated_length": 1578.0, "epoch": 0.7238225656575537, "frac_reward_zero_std": 0.0, "grad_norm": 0.7289378966243152, "kl": 0.48779296875, "learning_rate": 4.311572692237834e-06, "loss": 0.0046, "num_tokens": 799574627.0, "reward": 0.0224609375, "reward_std": 0.020716294646263123, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.16988317668437958, "step": 1850 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7242138210984497, "frac_reward_zero_std": 0.0, "grad_norm": 0.8185750964902512, "kl": 0.477294921875, "learning_rate": 4.3003441269300535e-06, "loss": 0.0048, "num_tokens": 800127411.0, "reward": 0.024169921875, "reward_std": 0.020747246220707893, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.193359375, "rewards/tag_count_reward/std": 0.17068158090114594, "step": 1851 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7246050765393456, "frac_reward_zero_std": 0.0, "grad_norm": 0.8043896816357289, "kl": 0.474609375, "learning_rate": 4.289126195523968e-06, "loss": 0.0047, "num_tokens": 800681123.0, "reward": 0.02197265625, "reward_std": 0.020356856286525726, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.16377703845500946, "step": 1852 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7249963319802416, "frac_reward_zero_std": 0.0, "grad_norm": 0.6074686427548107, "kl": 0.474853515625, "learning_rate": 4.277918918948974e-06, "loss": 0.0047, "num_tokens": 801235043.0, "reward": 0.0244140625, "reward_std": 0.02158750779926777, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1953125, "rewards/tag_count_reward/std": 0.17555932700634003, "step": 1853 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7253875874211376, "frac_reward_zero_std": 0.0, "grad_norm": 0.6657823010145465, "kl": 0.477294921875, "learning_rate": 4.2667223181145875e-06, "loss": 0.0048, "num_tokens": 801788515.0, "reward": 0.0213623046875, "reward_std": 0.020319029688835144, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1708984375, "rewards/tag_count_reward/std": 0.1622234433889389, "step": 1854 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7257788428620335, "frac_reward_zero_std": 0.0, "grad_norm": 0.7158150288384826, "kl": 0.486083984375, "learning_rate": 4.2555364139104215e-06, "loss": 0.0049, "num_tokens": 802341699.0, "reward": 0.0198974609375, "reward_std": 0.02070966362953186, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1591796875, "rewards/tag_count_reward/std": 0.16510160267353058, "step": 1855 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7261700983029296, "frac_reward_zero_std": 0.0, "grad_norm": 0.8120098108453743, "kl": 0.483154296875, "learning_rate": 4.244361227206118e-06, "loss": 0.0048, "num_tokens": 802898515.0, "reward": 0.02197265625, "reward_std": 0.019460054114460945, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.1576773077249527, "step": 1856 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7265613537438255, "frac_reward_zero_std": 0.0, "grad_norm": 0.7499471807067755, "kl": 0.49072265625, "learning_rate": 4.2331967788513295e-06, "loss": 0.0049, "num_tokens": 803454131.0, "reward": 0.0208740234375, "reward_std": 0.020808303728699684, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1669921875, "rewards/tag_count_reward/std": 0.1677250862121582, "step": 1857 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7269526091847215, "frac_reward_zero_std": 0.0, "grad_norm": 0.6103882404195238, "kl": 0.50390625, "learning_rate": 4.222043089675673e-06, "loss": 0.005, "num_tokens": 804007955.0, "reward": 0.020751953125, "reward_std": 0.021023951470851898, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.166015625, "rewards/tag_count_reward/std": 0.1679675132036209, "step": 1858 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7273438646256174, "frac_reward_zero_std": 0.0, "grad_norm": 0.8092729163103541, "kl": 0.52490234375, "learning_rate": 4.2109001804886855e-06, "loss": 0.0052, "num_tokens": 804560435.0, "reward": 0.0185546875, "reward_std": 0.019681816920638084, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1484375, "rewards/tag_count_reward/std": 0.16250942647457123, "step": 1859 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7277351200665134, "frac_reward_zero_std": 0.0, "grad_norm": 0.7348940561013386, "kl": 0.53369140625, "learning_rate": 4.199768072079806e-06, "loss": 0.0053, "num_tokens": 805114035.0, "reward": 0.0228271484375, "reward_std": 0.021117866039276123, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1826171875, "rewards/tag_count_reward/std": 0.1717858612537384, "step": 1860 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7281263755074094, "frac_reward_zero_std": 0.0, "grad_norm": 0.6569939840832495, "kl": 0.544921875, "learning_rate": 4.188646785218316e-06, "loss": 0.0054, "num_tokens": 805669347.0, "reward": 0.0218505859375, "reward_std": 0.02130316011607647, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.17280830442905426, "step": 1861 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7285176309483054, "frac_reward_zero_std": 0.0, "grad_norm": 1.2119586831788454, "kl": 0.5615234375, "learning_rate": 4.177536340653291e-06, "loss": 0.0056, "num_tokens": 806224739.0, "reward": 0.02197265625, "reward_std": 0.021205518394708633, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.17109617590904236, "step": 1862 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7289088863892014, "frac_reward_zero_std": 0.0, "grad_norm": 0.7907018795293521, "kl": 0.57080078125, "learning_rate": 4.166436759113605e-06, "loss": 0.0057, "num_tokens": 806779283.0, "reward": 0.021240234375, "reward_std": 0.019576840102672577, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.169921875, "rewards/tag_count_reward/std": 0.15945248305797577, "step": 1863 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7293001418300973, "frac_reward_zero_std": 0.0, "grad_norm": 0.6974460757680379, "kl": 0.58203125, "learning_rate": 4.155348061307849e-06, "loss": 0.0058, "num_tokens": 807332467.0, "reward": 0.0216064453125, "reward_std": 0.021339021623134613, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.1719418317079544, "step": 1864 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7296913972709933, "frac_reward_zero_std": 0.0, "grad_norm": 1.2385231213734356, "kl": 0.56884765625, "learning_rate": 4.144270267924306e-06, "loss": 0.0057, "num_tokens": 807886627.0, "reward": 0.02099609375, "reward_std": 0.020622773095965385, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16796875, "rewards/tag_count_reward/std": 0.1630270630121231, "step": 1865 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7300826527118893, "frac_reward_zero_std": 0.0, "grad_norm": 1.0550832394352971, "kl": 0.572265625, "learning_rate": 4.1332033996309276e-06, "loss": 0.0057, "num_tokens": 808439635.0, "reward": 0.022216796875, "reward_std": 0.019955933094024658, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.163156196475029, "step": 1866 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7304739081527852, "frac_reward_zero_std": 0.0, "grad_norm": 0.8208849040456626, "kl": 0.5595703125, "learning_rate": 4.12214747707527e-06, "loss": 0.0056, "num_tokens": 808993395.0, "reward": 0.0203857421875, "reward_std": 0.020634641870856285, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1630859375, "rewards/tag_count_reward/std": 0.1657267063856125, "step": 1867 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7308651635936813, "frac_reward_zero_std": 0.0, "grad_norm": 0.6744872906455092, "kl": 0.5537109375, "learning_rate": 4.111102520884472e-06, "loss": 0.0055, "num_tokens": 809545891.0, "reward": 0.0220947265625, "reward_std": 0.0206784475594759, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.16496238112449646, "step": 1868 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7312564190345772, "frac_reward_zero_std": 0.0, "grad_norm": 0.6193068493848894, "kl": 0.5322265625, "learning_rate": 4.100068551665214e-06, "loss": 0.0053, "num_tokens": 810099859.0, "reward": 0.0189208984375, "reward_std": 0.020773645490407944, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1513671875, "rewards/tag_count_reward/std": 0.16653354465961456, "step": 1869 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7316476744754732, "frac_reward_zero_std": 0.0, "grad_norm": 0.842389691173441, "kl": 0.51416015625, "learning_rate": 4.089045590003671e-06, "loss": 0.0051, "num_tokens": 810653587.0, "reward": 0.0203857421875, "reward_std": 0.01995699666440487, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1630859375, "rewards/tag_count_reward/std": 0.16719910502433777, "step": 1870 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7320389299163691, "frac_reward_zero_std": 0.0, "grad_norm": 0.7349739684097389, "kl": 0.50927734375, "learning_rate": 4.0780336564654944e-06, "loss": 0.0051, "num_tokens": 811207843.0, "reward": 0.017822265625, "reward_std": 0.018734201788902283, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.142578125, "rewards/tag_count_reward/std": 0.15397858619689941, "step": 1871 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7324301853572651, "frac_reward_zero_std": 0.0, "grad_norm": 1.5322591540574204, "kl": 0.488525390625, "learning_rate": 4.067032771595749e-06, "loss": 0.0049, "num_tokens": 811761715.0, "reward": 0.0181884765625, "reward_std": 0.01966157928109169, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1455078125, "rewards/tag_count_reward/std": 0.1598692387342453, "step": 1872 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7328214407981611, "frac_reward_zero_std": 0.0, "grad_norm": 0.8618405219174097, "kl": 0.489990234375, "learning_rate": 4.05604295591889e-06, "loss": 0.0049, "num_tokens": 812313699.0, "reward": 0.0177001953125, "reward_std": 0.01948150247335434, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1416015625, "rewards/tag_count_reward/std": 0.15566988289356232, "step": 1873 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.733212696239057, "frac_reward_zero_std": 0.0, "grad_norm": 0.8132835592146342, "kl": 0.475830078125, "learning_rate": 4.045064229938718e-06, "loss": 0.0048, "num_tokens": 812868403.0, "reward": 0.0234375, "reward_std": 0.021533291786909103, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1875, "rewards/tag_count_reward/std": 0.17433346807956696, "step": 1874 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7336039516799531, "frac_reward_zero_std": 0.0, "grad_norm": 0.5956481554909657, "kl": 0.48046875, "learning_rate": 4.034096614138343e-06, "loss": 0.0048, "num_tokens": 813421411.0, "reward": 0.0201416015625, "reward_std": 0.019005466252565384, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1611328125, "rewards/tag_count_reward/std": 0.15549758076667786, "step": 1875 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.733995207120849, "frac_reward_zero_std": 0.0, "grad_norm": 0.6302591916696164, "kl": 0.4638671875, "learning_rate": 4.023140128980157e-06, "loss": 0.0046, "num_tokens": 813972595.0, "reward": 0.02099609375, "reward_std": 0.022137340158224106, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.16796875, "rewards/tag_count_reward/std": 0.17742541432380676, "step": 1876 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.734386462561745, "frac_reward_zero_std": 0.0, "grad_norm": 0.5884008131682038, "kl": 0.451904296875, "learning_rate": 4.0121947949057745e-06, "loss": 0.0045, "num_tokens": 814527939.0, "reward": 0.022216796875, "reward_std": 0.02122851088643074, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.17050199210643768, "step": 1877 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.734777718002641, "frac_reward_zero_std": 0.0, "grad_norm": 0.5787353460216426, "kl": 0.444091796875, "learning_rate": 4.001260632336008e-06, "loss": 0.0044, "num_tokens": 815081075.0, "reward": 0.0238037109375, "reward_std": 0.02107948437333107, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1904296875, "rewards/tag_count_reward/std": 0.1703980714082718, "step": 1878 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7351689734435369, "frac_reward_zero_std": 0.0, "grad_norm": 0.6044574870986389, "kl": 0.425048828125, "learning_rate": 3.990337661670827e-06, "loss": 0.0043, "num_tokens": 815636323.0, "reward": 0.0208740234375, "reward_std": 0.02082815393805504, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1669921875, "rewards/tag_count_reward/std": 0.1677250862121582, "step": 1879 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.735560228884433, "frac_reward_zero_std": 0.0, "grad_norm": 0.45554976133998504, "kl": 0.444580078125, "learning_rate": 3.97942590328932e-06, "loss": 0.0044, "num_tokens": 816190083.0, "reward": 0.0198974609375, "reward_std": 0.020606573671102524, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1591796875, "rewards/tag_count_reward/std": 0.17093661427497864, "step": 1880 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7359514843253289, "frac_reward_zero_std": 0.0, "grad_norm": 0.4818565141467152, "kl": 0.431884765625, "learning_rate": 3.968525377549657e-06, "loss": 0.0043, "num_tokens": 816744067.0, "reward": 0.022216796875, "reward_std": 0.02196822315454483, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.1747613102197647, "step": 1881 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7363427397662249, "frac_reward_zero_std": 0.0, "grad_norm": 3.864905113304056, "kl": 0.430419921875, "learning_rate": 3.957636104789056e-06, "loss": 0.0043, "num_tokens": 817296707.0, "reward": 0.0228271484375, "reward_std": 0.020744917914271355, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1826171875, "rewards/tag_count_reward/std": 0.1717858612537384, "step": 1882 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7367339952071208, "frac_reward_zero_std": 0.0, "grad_norm": 0.5327015385085957, "kl": 0.422607421875, "learning_rate": 3.946758105323733e-06, "loss": 0.0042, "num_tokens": 817848947.0, "reward": 0.0211181640625, "reward_std": 0.020306330174207687, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1689453125, "rewards/tag_count_reward/std": 0.1627655029296875, "step": 1883 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7371252506480168, "frac_reward_zero_std": 0.0, "grad_norm": 0.4445918656841841, "kl": 0.41357421875, "learning_rate": 3.935891399448874e-06, "loss": 0.0041, "num_tokens": 818403219.0, "reward": 0.0216064453125, "reward_std": 0.01900428533554077, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.15228737890720367, "step": 1884 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7375165060889128, "frac_reward_zero_std": 0.0, "grad_norm": 0.6670438232779881, "kl": 0.41845703125, "learning_rate": 3.925036007438594e-06, "loss": 0.0042, "num_tokens": 818957075.0, "reward": 0.0223388671875, "reward_std": 0.020994942635297775, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1787109375, "rewards/tag_count_reward/std": 0.16874943673610687, "step": 1885 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7379077615298087, "frac_reward_zero_std": 0.0, "grad_norm": 0.5364525007362753, "kl": 0.427978515625, "learning_rate": 3.914191949545899e-06, "loss": 0.0043, "num_tokens": 819511107.0, "reward": 0.0216064453125, "reward_std": 0.02171100676059723, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.17892730236053467, "step": 1886 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7382990169707048, "frac_reward_zero_std": 0.0, "grad_norm": 0.5432960888015599, "kl": 0.424560546875, "learning_rate": 3.903359246002655e-06, "loss": 0.0042, "num_tokens": 820066963.0, "reward": 0.0233154296875, "reward_std": 0.02095799148082733, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1865234375, "rewards/tag_count_reward/std": 0.16897623240947723, "step": 1887 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7386902724116007, "frac_reward_zero_std": 0.0, "grad_norm": 0.5889576657401207, "kl": 0.437744140625, "learning_rate": 3.892537917019537e-06, "loss": 0.0044, "num_tokens": 820623187.0, "reward": 0.0225830078125, "reward_std": 0.021598845720291138, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1806640625, "rewards/tag_count_reward/std": 0.17243115603923798, "step": 1888 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7390815278524967, "frac_reward_zero_std": 0.0, "grad_norm": 0.6494716429562977, "kl": 0.440673828125, "learning_rate": 3.881727982785999e-06, "loss": 0.0044, "num_tokens": 821176243.0, "reward": 0.0218505859375, "reward_std": 0.021914411336183548, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.1770121455192566, "step": 1889 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7394727832933927, "frac_reward_zero_std": 0.0, "grad_norm": 0.627278158896475, "kl": 0.4453125, "learning_rate": 3.8709294634702374e-06, "loss": 0.0045, "num_tokens": 821730259.0, "reward": 0.02197265625, "reward_std": 0.021109024062752724, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.1696576029062271, "step": 1890 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7398640387342886, "frac_reward_zero_std": 0.0, "grad_norm": 0.7034007980300001, "kl": 0.476806640625, "learning_rate": 3.860142379219153e-06, "loss": 0.0048, "num_tokens": 822284739.0, "reward": 0.0220947265625, "reward_std": 0.022003907710313797, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.1750541627407074, "step": 1891 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7402552941751847, "frac_reward_zero_std": 0.0, "grad_norm": 0.9449024440095636, "kl": 0.49755859375, "learning_rate": 3.849366750158305e-06, "loss": 0.005, "num_tokens": 822837827.0, "reward": 0.022705078125, "reward_std": 0.020656535401940346, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.181640625, "rewards/tag_count_reward/std": 0.1648375391960144, "step": 1892 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7406465496160806, "frac_reward_zero_std": 0.0, "grad_norm": 1.0780994413067233, "kl": 0.506103515625, "learning_rate": 3.838602596391895e-06, "loss": 0.0051, "num_tokens": 823391059.0, "reward": 0.0223388671875, "reward_std": 0.020044401288032532, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1787109375, "rewards/tag_count_reward/std": 0.16433437168598175, "step": 1893 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7410378050569766, "frac_reward_zero_std": 0.0, "grad_norm": 0.7744064897410259, "kl": 0.462646484375, "learning_rate": 3.827849938002701e-06, "loss": 0.0046, "num_tokens": 823944595.0, "reward": 0.021484375, "reward_std": 0.021001184359192848, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.171875, "rewards/tag_count_reward/std": 0.1693412959575653, "step": 1894 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7414290604978725, "frac_reward_zero_std": 0.0, "grad_norm": 0.5548456170218129, "kl": 0.4501953125, "learning_rate": 3.817108795052061e-06, "loss": 0.0045, "num_tokens": 824498851.0, "reward": 0.0218505859375, "reward_std": 0.0199403278529644, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.16407781839370728, "step": 1895 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7418203159387685, "frac_reward_zero_std": 0.0, "grad_norm": 0.6746195921988428, "kl": 0.449462890625, "learning_rate": 3.806379187579825e-06, "loss": 0.0045, "num_tokens": 825051219.0, "reward": 0.022216796875, "reward_std": 0.019638294354081154, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.16465157270431519, "step": 1896 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7422115713796645, "frac_reward_zero_std": 0.0, "grad_norm": 0.6000209091891247, "kl": 0.4453125, "learning_rate": 3.7956611356043196e-06, "loss": 0.0045, "num_tokens": 825605859.0, "reward": 0.023193359375, "reward_std": 0.021053588017821312, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.185546875, "rewards/tag_count_reward/std": 0.16932998597621918, "step": 1897 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7426028268205604, "frac_reward_zero_std": 0.0, "grad_norm": 0.696536539212069, "kl": 0.445556640625, "learning_rate": 3.784954659122323e-06, "loss": 0.0045, "num_tokens": 826162099.0, "reward": 0.02294921875, "reward_std": 0.020804714411497116, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.18359375, "rewards/tag_count_reward/std": 0.17145392298698425, "step": 1898 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7429940822614565, "frac_reward_zero_std": 0.0, "grad_norm": 0.5773425631953616, "kl": 0.448486328125, "learning_rate": 3.7742597781090064e-06, "loss": 0.0045, "num_tokens": 826714403.0, "reward": 0.021728515625, "reward_std": 0.01961921527981758, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.1613624393939972, "step": 1899 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7433853377023524, "frac_reward_zero_std": 0.0, "grad_norm": 0.6676032435589961, "kl": 0.44970703125, "learning_rate": 3.7635765125179045e-06, "loss": 0.0045, "num_tokens": 827267283.0, "reward": 0.02197265625, "reward_std": 0.020686699077486992, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.17578125, "rewards/tag_count_reward/std": 0.16820673644542694, "step": 1900 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7437765931432484, "frac_reward_zero_std": 0.0, "grad_norm": 0.6314603651851597, "kl": 0.473388671875, "learning_rate": 3.752904882280899e-06, "loss": 0.0047, "num_tokens": 827819091.0, "reward": 0.0234375, "reward_std": 0.021038632839918137, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1875, "rewards/tag_count_reward/std": 0.1671561449766159, "step": 1901 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7441678485841444, "frac_reward_zero_std": 0.0, "grad_norm": 0.6332029271747673, "kl": 0.464111328125, "learning_rate": 3.7422449073081356e-06, "loss": 0.0046, "num_tokens": 828371043.0, "reward": 0.019775390625, "reward_std": 0.020707853138446808, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.158203125, "rewards/tag_count_reward/std": 0.16677770018577576, "step": 1902 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7445591040250403, "frac_reward_zero_std": 0.0, "grad_norm": 0.5669001357491029, "kl": 0.4609375, "learning_rate": 3.731596607488042e-06, "loss": 0.0046, "num_tokens": 828923955.0, "reward": 0.02294921875, "reward_std": 0.020225828513503075, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.18359375, "rewards/tag_count_reward/std": 0.16415074467658997, "step": 1903 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7449503594659364, "frac_reward_zero_std": 0.0, "grad_norm": 0.6238387054491171, "kl": 0.458740234375, "learning_rate": 3.7209600026872474e-06, "loss": 0.0046, "num_tokens": 829479107.0, "reward": 0.020263671875, "reward_std": 0.021182555705308914, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.162109375, "rewards/tag_count_reward/std": 0.17175519466400146, "step": 1904 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7453416149068323, "frac_reward_zero_std": 0.0, "grad_norm": 0.8166309045162294, "kl": 0.47802734375, "learning_rate": 3.7103351127505616e-06, "loss": 0.0048, "num_tokens": 830032243.0, "reward": 0.0216064453125, "reward_std": 0.021777190268039703, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1728515625, "rewards/tag_count_reward/std": 0.17476952075958252, "step": 1905 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7457328703477283, "frac_reward_zero_std": 0.0, "grad_norm": 0.6102038396636357, "kl": 0.467041015625, "learning_rate": 3.6997219575009533e-06, "loss": 0.0047, "num_tokens": 830587171.0, "reward": 0.0206298828125, "reward_std": 0.019586358219385147, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1650390625, "rewards/tag_count_reward/std": 0.15767426788806915, "step": 1906 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7461241257886242, "frac_reward_zero_std": 0.0, "grad_norm": 0.7336647916623195, "kl": 0.46923828125, "learning_rate": 3.689120556739475e-06, "loss": 0.0047, "num_tokens": 831140387.0, "reward": 0.0206298828125, "reward_std": 0.020574521273374557, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1650390625, "rewards/tag_count_reward/std": 0.16526390612125397, "step": 1907 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7465153812295202, "frac_reward_zero_std": 0.0, "grad_norm": 0.8505474757486574, "kl": 0.468505859375, "learning_rate": 3.678530930245261e-06, "loss": 0.0047, "num_tokens": 831694643.0, "reward": 0.0224609375, "reward_std": 0.02222328819334507, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.17694993317127228, "step": 1908 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7469066366704162, "frac_reward_zero_std": 0.0, "grad_norm": 0.5666685156344256, "kl": 0.46044921875, "learning_rate": 3.667953097775484e-06, "loss": 0.0046, "num_tokens": 832249571.0, "reward": 0.024169921875, "reward_std": 0.020934229716658592, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.193359375, "rewards/tag_count_reward/std": 0.16631780564785004, "step": 1909 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7472978921113121, "frac_reward_zero_std": 0.0, "grad_norm": 0.6096338982330071, "kl": 0.452880859375, "learning_rate": 3.657387079065302e-06, "loss": 0.0045, "num_tokens": 832806227.0, "reward": 0.0230712890625, "reward_std": 0.020446058362722397, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1845703125, "rewards/tag_count_reward/std": 0.17111575603485107, "step": 1910 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7476891475522082, "frac_reward_zero_std": 0.0, "grad_norm": 0.5043042663712594, "kl": 0.45947265625, "learning_rate": 3.6468328938278317e-06, "loss": 0.0046, "num_tokens": 833359427.0, "reward": 0.021728515625, "reward_std": 0.02009512111544609, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.1613624393939972, "step": 1911 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7480804029931041, "frac_reward_zero_std": 0.0, "grad_norm": 0.5614098622086809, "kl": 0.47021484375, "learning_rate": 3.6362905617541276e-06, "loss": 0.0047, "num_tokens": 833914259.0, "reward": 0.021728515625, "reward_std": 0.0203838013112545, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.16585664451122284, "step": 1912 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7484716584340001, "frac_reward_zero_std": 0.0, "grad_norm": 0.5431301334491195, "kl": 0.488525390625, "learning_rate": 3.625760102513103e-06, "loss": 0.0049, "num_tokens": 834466371.0, "reward": 0.0233154296875, "reward_std": 0.02147107943892479, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1865234375, "rewards/tag_count_reward/std": 0.1746818572282791, "step": 1913 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7488629138748961, "frac_reward_zero_std": 0.0, "grad_norm": 0.7242643941980914, "kl": 0.47314453125, "learning_rate": 3.6152415357515458e-06, "loss": 0.0047, "num_tokens": 835020579.0, "reward": 0.0211181640625, "reward_std": 0.02016698569059372, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1689453125, "rewards/tag_count_reward/std": 0.16426444053649902, "step": 1914 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.749254169315792, "frac_reward_zero_std": 0.0, "grad_norm": 0.6130458577511035, "kl": 0.470703125, "learning_rate": 3.604734881094043e-06, "loss": 0.0047, "num_tokens": 835573827.0, "reward": 0.021728515625, "reward_std": 0.02256273478269577, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.173828125, "rewards/tag_count_reward/std": 0.18138517439365387, "step": 1915 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.749645424756688, "frac_reward_zero_std": 0.0, "grad_norm": 0.6839392377275689, "kl": 0.462890625, "learning_rate": 3.594240158142953e-06, "loss": 0.0046, "num_tokens": 836126899.0, "reward": 0.0218505859375, "reward_std": 0.02050463855266571, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.1655648797750473, "step": 1916 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.750036680197584, "frac_reward_zero_std": 0.0, "grad_norm": 1.2042602259024797, "kl": 0.474365234375, "learning_rate": 3.583757386478389e-06, "loss": 0.0047, "num_tokens": 836680867.0, "reward": 0.01953125, "reward_std": 0.019842837005853653, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.15625, "rewards/tag_count_reward/std": 0.16118435561656952, "step": 1917 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.75042793563848, "frac_reward_zero_std": 0.0, "grad_norm": 0.6911312928469789, "kl": 0.48291015625, "learning_rate": 3.5732865856581544e-06, "loss": 0.0048, "num_tokens": 837235491.0, "reward": 0.0220947265625, "reward_std": 0.022443167865276337, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.18056783080101013, "step": 1918 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.750819191079376, "frac_reward_zero_std": 0.0, "grad_norm": 0.48323404144269977, "kl": 0.465087890625, "learning_rate": 3.5628277752177134e-06, "loss": 0.0046, "num_tokens": 837789715.0, "reward": 0.022705078125, "reward_std": 0.020512346178293228, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.181640625, "rewards/tag_count_reward/std": 0.16631780564785004, "step": 1919 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7512104465202719, "frac_reward_zero_std": 0.0, "grad_norm": 0.6641059882307067, "kl": 0.463623046875, "learning_rate": 3.5523809746701753e-06, "loss": 0.0046, "num_tokens": 838343123.0, "reward": 0.0234375, "reward_std": 0.02074948325753212, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1875, "rewards/tag_count_reward/std": 0.16861605644226074, "step": 1920 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7516017019611679, "frac_reward_zero_std": 0.0, "grad_norm": 0.672750097332192, "kl": 0.458251953125, "learning_rate": 3.5419462035062313e-06, "loss": 0.0046, "num_tokens": 838896355.0, "reward": 0.0194091796875, "reward_std": 0.01922622323036194, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1552734375, "rewards/tag_count_reward/std": 0.15674859285354614, "step": 1921 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7519929574020638, "frac_reward_zero_std": 0.0, "grad_norm": 0.5461103447472736, "kl": 0.46826171875, "learning_rate": 3.53152348119413e-06, "loss": 0.0047, "num_tokens": 839450595.0, "reward": 0.022216796875, "reward_std": 0.020909275859594345, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.177734375, "rewards/tag_count_reward/std": 0.1676023155450821, "step": 1922 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7523842128429599, "frac_reward_zero_std": 0.0, "grad_norm": 0.6979756302372675, "kl": 0.482666015625, "learning_rate": 3.5211128271796524e-06, "loss": 0.0048, "num_tokens": 840004851.0, "reward": 0.0223388671875, "reward_std": 0.020919708535075188, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1787109375, "rewards/tag_count_reward/std": 0.16874943673610687, "step": 1923 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7527754682838558, "frac_reward_zero_std": 0.0, "grad_norm": 0.7459096301020026, "kl": 0.463623046875, "learning_rate": 3.5107142608860403e-06, "loss": 0.0046, "num_tokens": 840558291.0, "reward": 0.022705078125, "reward_std": 0.01985405758023262, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.181640625, "rewards/tag_count_reward/std": 0.16031478345394135, "step": 1924 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7531667237247518, "frac_reward_zero_std": 0.0, "grad_norm": 0.7620635182203602, "kl": 0.46337890625, "learning_rate": 3.500327801714006e-06, "loss": 0.0046, "num_tokens": 841112371.0, "reward": 0.0257568359375, "reward_std": 0.021935757249593735, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2060546875, "rewards/tag_count_reward/std": 0.1764921396970749, "step": 1925 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7535579791656478, "frac_reward_zero_std": 0.0, "grad_norm": 0.6123531871239758, "kl": 0.46142578125, "learning_rate": 3.4899534690416627e-06, "loss": 0.0046, "num_tokens": 841668227.0, "reward": 0.0225830078125, "reward_std": 0.021496152505278587, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1806640625, "rewards/tag_count_reward/std": 0.17243115603923798, "step": 1926 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7539492346065437, "frac_reward_zero_std": 0.0, "grad_norm": 0.5725889648117352, "kl": 0.453857421875, "learning_rate": 3.479591282224496e-06, "loss": 0.0045, "num_tokens": 842221443.0, "reward": 0.0196533203125, "reward_std": 0.021015292033553123, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1572265625, "rewards/tag_count_reward/std": 0.16988036036491394, "step": 1927 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7543404900474397, "frac_reward_zero_std": 0.0, "grad_norm": 0.6118945968383547, "kl": 0.45751953125, "learning_rate": 3.4692412605953417e-06, "loss": 0.0046, "num_tokens": 842776787.0, "reward": 0.024169921875, "reward_std": 0.02235218696296215, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.193359375, "rewards/tag_count_reward/std": 0.1804538071155548, "step": 1928 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7547317454883357, "frac_reward_zero_std": 0.0, "grad_norm": 0.6693483717806552, "kl": 0.432373046875, "learning_rate": 3.4589034234643315e-06, "loss": 0.0043, "num_tokens": 843331587.0, "reward": 0.0218505859375, "reward_std": 0.02059829793870449, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.16994798183441162, "step": 1929 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7551230009292317, "frac_reward_zero_std": 0.0, "grad_norm": 0.6779315517212142, "kl": 0.42578125, "learning_rate": 3.4485777901188633e-06, "loss": 0.0043, "num_tokens": 843887843.0, "reward": 0.02294921875, "reward_std": 0.023292526602745056, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.18359375, "rewards/tag_count_reward/std": 0.18912693858146667, "step": 1930 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7555142563701277, "frac_reward_zero_std": 0.0, "grad_norm": 0.5613569150982521, "kl": 0.430419921875, "learning_rate": 3.4382643798235704e-06, "loss": 0.0043, "num_tokens": 844440979.0, "reward": 0.022705078125, "reward_std": 0.021118011325597763, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.181640625, "rewards/tag_count_reward/std": 0.16923949122428894, "step": 1931 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7559055118110236, "frac_reward_zero_std": 0.0, "grad_norm": 1.683935226273944, "kl": 0.43310546875, "learning_rate": 3.4279632118202744e-06, "loss": 0.0043, "num_tokens": 844994515.0, "reward": 0.0218505859375, "reward_std": 0.021310217678546906, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1748046875, "rewards/tag_count_reward/std": 0.17138411104679108, "step": 1932 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7562967672519196, "frac_reward_zero_std": 0.0, "grad_norm": 0.7460028658059554, "kl": 0.4306640625, "learning_rate": 3.4176743053279705e-06, "loss": 0.0043, "num_tokens": 845548659.0, "reward": 0.0230712890625, "reward_std": 0.022249354049563408, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1845703125, "rewards/tag_count_reward/std": 0.17813360691070557, "step": 1933 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7566880226928155, "frac_reward_zero_std": 0.0, "grad_norm": 0.8673404326446942, "kl": 0.4326171875, "learning_rate": 3.4073976795427634e-06, "loss": 0.0043, "num_tokens": 846105555.0, "reward": 0.0233154296875, "reward_std": 0.022105790674686432, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1865234375, "rewards/tag_count_reward/std": 0.1760793775320053, "step": 1934 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7570792781337116, "frac_reward_zero_std": 0.0, "grad_norm": 1.1191694567688455, "kl": 0.427490234375, "learning_rate": 3.3971333536378503e-06, "loss": 0.0043, "num_tokens": 846660755.0, "reward": 0.023193359375, "reward_std": 0.02127516269683838, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.185546875, "rewards/tag_count_reward/std": 0.17220056056976318, "step": 1935 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7574705335746075, "frac_reward_zero_std": 0.0, "grad_norm": 0.8014915734066053, "kl": 0.4365234375, "learning_rate": 3.3868813467634833e-06, "loss": 0.0044, "num_tokens": 847215027.0, "reward": 0.0224609375, "reward_std": 0.020655345171689987, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1796875, "rewards/tag_count_reward/std": 0.16697275638580322, "step": 1936 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7578617890155035, "frac_reward_zero_std": 0.0, "grad_norm": 0.6721329507579695, "kl": 0.443359375, "learning_rate": 3.376641678046926e-06, "loss": 0.0044, "num_tokens": 847767859.0, "reward": 0.0220947265625, "reward_std": 0.02277851104736328, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1767578125, "rewards/tag_count_reward/std": 0.18723173439502716, "step": 1937 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7582530444563995, "frac_reward_zero_std": 0.0, "grad_norm": 0.8682069040015398, "kl": 0.457763671875, "learning_rate": 3.366414366592422e-06, "loss": 0.0046, "num_tokens": 848321555.0, "reward": 0.0230712890625, "reward_std": 0.023143084719777107, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1845703125, "rewards/tag_count_reward/std": 0.18882042169570923, "step": 1938 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7586442998972954, "frac_reward_zero_std": 0.0, "grad_norm": 0.9307342680849077, "kl": 0.4619140625, "learning_rate": 3.3561994314811697e-06, "loss": 0.0046, "num_tokens": 848877235.0, "reward": 0.0250244140625, "reward_std": 0.023964878171682358, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2001953125, "rewards/tag_count_reward/std": 0.19227728247642517, "step": 1939 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7590355553381914, "frac_reward_zero_std": 0.0, "grad_norm": 0.8777663888374037, "kl": 0.472412109375, "learning_rate": 3.345996891771267e-06, "loss": 0.0047, "num_tokens": 849430947.0, "reward": 0.025390625, "reward_std": 0.022634092718362808, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.203125, "rewards/tag_count_reward/std": 0.18324418365955353, "step": 1940 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7594268107790874, "frac_reward_zero_std": 0.0, "grad_norm": 1.3361828337361954, "kl": 0.464111328125, "learning_rate": 3.3358067664976866e-06, "loss": 0.0046, "num_tokens": 849984435.0, "reward": 0.024658203125, "reward_std": 0.024121953174471855, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.197265625, "rewards/tag_count_reward/std": 0.19340181350708008, "step": 1941 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7598180662199834, "frac_reward_zero_std": 0.0, "grad_norm": 1.2516886176097597, "kl": 0.46142578125, "learning_rate": 3.325629074672244e-06, "loss": 0.0046, "num_tokens": 850537891.0, "reward": 0.0238037109375, "reward_std": 0.02341405116021633, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1904296875, "rewards/tag_count_reward/std": 0.18817028403282166, "step": 1942 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7602093216608794, "frac_reward_zero_std": 0.0, "grad_norm": 0.7649790421978432, "kl": 0.469482421875, "learning_rate": 3.315463835283549e-06, "loss": 0.0047, "num_tokens": 851091427.0, "reward": 0.026123046875, "reward_std": 0.022961992770433426, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.208984375, "rewards/tag_count_reward/std": 0.1872854232788086, "step": 1943 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7606005771017753, "frac_reward_zero_std": 0.0, "grad_norm": 0.6529893810603937, "kl": 0.466552734375, "learning_rate": 3.3053110672969926e-06, "loss": 0.0047, "num_tokens": 851645443.0, "reward": 0.0240478515625, "reward_std": 0.02267666533589363, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.1923828125, "rewards/tag_count_reward/std": 0.18484382331371307, "step": 1944 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7609918325426713, "frac_reward_zero_std": 0.0, "grad_norm": 0.9748322863735814, "kl": 0.468017578125, "learning_rate": 3.2951707896546858e-06, "loss": 0.0047, "num_tokens": 852198771.0, "reward": 0.02587890625, "reward_std": 0.02404019609093666, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.20703125, "rewards/tag_count_reward/std": 0.19455675780773163, "step": 1945 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7613830879835672, "frac_reward_zero_std": 0.0, "grad_norm": 0.9776168722990216, "kl": 0.46728515625, "learning_rate": 3.2850430212754403e-06, "loss": 0.0047, "num_tokens": 852751891.0, "reward": 0.025634765625, "reward_std": 0.026032857596874237, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.205078125, "rewards/tag_count_reward/std": 0.21105150878429413, "step": 1946 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7617743434244633, "frac_reward_zero_std": 0.0, "grad_norm": 0.7982550660240981, "kl": 0.466064453125, "learning_rate": 3.2749277810547286e-06, "loss": 0.0047, "num_tokens": 853304643.0, "reward": 0.0296630859375, "reward_std": 0.027166005223989487, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2373046875, "rewards/tag_count_reward/std": 0.21936844289302826, "step": 1947 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7621655988653592, "frac_reward_zero_std": 0.0, "grad_norm": 0.937845398996511, "kl": 0.483642578125, "learning_rate": 3.2648250878646514e-06, "loss": 0.0048, "num_tokens": 853857827.0, "reward": 0.035888671875, "reward_std": 0.024977805092930794, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.287109375, "rewards/tag_count_reward/std": 0.2102515995502472, "step": 1948 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7625568543062552, "frac_reward_zero_std": 0.0, "grad_norm": 1.165498991344451, "kl": 0.497314453125, "learning_rate": 3.2547349605538936e-06, "loss": 0.005, "num_tokens": 854411795.0, "reward": 0.03857421875, "reward_std": 0.027178827673196793, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.30859375, "rewards/tag_count_reward/std": 0.22136913239955902, "step": 1949 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7629481097471512, "frac_reward_zero_std": 0.0, "grad_norm": 0.7528722611716518, "kl": 0.51123046875, "learning_rate": 3.2446574179477075e-06, "loss": 0.0051, "num_tokens": 854966387.0, "reward": 0.0380859375, "reward_std": 0.027371473610401154, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3046875, "rewards/tag_count_reward/std": 0.2212653011083603, "step": 1950 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7633393651880471, "frac_reward_zero_std": 0.0, "grad_norm": 0.6570907911237381, "kl": 0.5341796875, "learning_rate": 3.2345924788478566e-06, "loss": 0.0053, "num_tokens": 855518003.0, "reward": 0.0418701171875, "reward_std": 0.029873067513108253, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3349609375, "rewards/tag_count_reward/std": 0.24325892329216003, "step": 1951 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7637306206289431, "frac_reward_zero_std": 0.0, "grad_norm": 0.8971116668720559, "kl": 0.5341796875, "learning_rate": 3.2245401620325934e-06, "loss": 0.0053, "num_tokens": 856071027.0, "reward": 0.0438232421875, "reward_std": 0.030629953369498253, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.24532821774482727, "step": 1952 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7641218760698391, "frac_reward_zero_std": 0.0, "grad_norm": 1.4687027626697744, "kl": 0.533203125, "learning_rate": 3.2145004862566185e-06, "loss": 0.0053, "num_tokens": 856625859.0, "reward": 0.0433349609375, "reward_std": 0.028791280463337898, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.2388262003660202, "step": 1953 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7645131315107351, "frac_reward_zero_std": 0.0, "grad_norm": 0.8553391563733302, "kl": 0.53369140625, "learning_rate": 3.204473470251047e-06, "loss": 0.0053, "num_tokens": 857178595.0, "reward": 0.04052734375, "reward_std": 0.027744639664888382, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.32421875, "rewards/tag_count_reward/std": 0.22109216451644897, "step": 1954 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7649043869516311, "frac_reward_zero_std": 0.0, "grad_norm": 1.121070126887146, "kl": 0.5517578125, "learning_rate": 3.1944591327233844e-06, "loss": 0.0055, "num_tokens": 857730691.0, "reward": 0.0458984375, "reward_std": 0.031637709587812424, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.25521522760391235, "step": 1955 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.765295642392527, "frac_reward_zero_std": 0.0, "grad_norm": 0.915707357968839, "kl": 0.5380859375, "learning_rate": 3.1844574923574713e-06, "loss": 0.0054, "num_tokens": 858284243.0, "reward": 0.04052734375, "reward_std": 0.02972964197397232, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.32421875, "rewards/tag_count_reward/std": 0.2381698489189148, "step": 1956 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.765686897833423, "frac_reward_zero_std": 0.0, "grad_norm": 0.8850857900791225, "kl": 0.54052734375, "learning_rate": 3.174468567813461e-06, "loss": 0.0054, "num_tokens": 858837907.0, "reward": 0.0396728515625, "reward_std": 0.02773735299706459, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3173828125, "rewards/tag_count_reward/std": 0.22816529870033264, "step": 1957 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7660781532743189, "frac_reward_zero_std": 0.0, "grad_norm": 0.8888279211122827, "kl": 0.53173828125, "learning_rate": 3.1644923777277857e-06, "loss": 0.0053, "num_tokens": 859391075.0, "reward": 0.0439453125, "reward_std": 0.027613326907157898, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.22237025201320648, "step": 1958 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.766469408715215, "frac_reward_zero_std": 0.0, "grad_norm": 0.7980746836183082, "kl": 0.5224609375, "learning_rate": 3.1545289407131128e-06, "loss": 0.0052, "num_tokens": 859945507.0, "reward": 0.0462646484375, "reward_std": 0.028968192636966705, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.23943477869033813, "step": 1959 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7668606641561109, "frac_reward_zero_std": 0.0, "grad_norm": 0.9446731784603769, "kl": 0.52734375, "learning_rate": 3.144578275358329e-06, "loss": 0.0053, "num_tokens": 860499011.0, "reward": 0.04443359375, "reward_std": 0.03074382245540619, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.24775339663028717, "step": 1960 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7672519195970069, "frac_reward_zero_std": 0.0, "grad_norm": 1.1091131842398874, "kl": 0.51953125, "learning_rate": 3.134640400228479e-06, "loss": 0.0052, "num_tokens": 861051011.0, "reward": 0.046142578125, "reward_std": 0.028127595782279968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.2289465367794037, "step": 1961 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7676431750379029, "frac_reward_zero_std": 0.0, "grad_norm": 1.1207833267252085, "kl": 0.51513671875, "learning_rate": 3.1247153338647486e-06, "loss": 0.0051, "num_tokens": 861604307.0, "reward": 0.042236328125, "reward_std": 0.02817816101014614, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.22814221680164337, "step": 1962 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7680344304787988, "frac_reward_zero_std": 0.0, "grad_norm": 1.9291696494739736, "kl": 0.504150390625, "learning_rate": 3.1148030947844353e-06, "loss": 0.005, "num_tokens": 862159507.0, "reward": 0.0404052734375, "reward_std": 0.028357570990920067, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.2338024079799652, "step": 1963 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7684256859196948, "frac_reward_zero_std": 0.0, "grad_norm": 0.8773914940160872, "kl": 0.50927734375, "learning_rate": 3.10490370148089e-06, "loss": 0.0051, "num_tokens": 862712739.0, "reward": 0.0438232421875, "reward_std": 0.029566515237092972, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.24332188069820404, "step": 1964 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7688169413605908, "frac_reward_zero_std": 0.0, "grad_norm": 0.6884045032294623, "kl": 0.500732421875, "learning_rate": 3.095017172423502e-06, "loss": 0.005, "num_tokens": 863265379.0, "reward": 0.0390625, "reward_std": 0.027806444093585014, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3125, "rewards/tag_count_reward/std": 0.2268713116645813, "step": 1965 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7692081968014868, "frac_reward_zero_std": 0.0, "grad_norm": 0.8394733569070071, "kl": 0.492919921875, "learning_rate": 3.0851435260576703e-06, "loss": 0.0049, "num_tokens": 863817747.0, "reward": 0.0379638671875, "reward_std": 0.02727663889527321, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3037109375, "rewards/tag_count_reward/std": 0.22205789387226105, "step": 1966 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7695994522423828, "frac_reward_zero_std": 0.0, "grad_norm": 0.6601978258500333, "kl": 0.485107421875, "learning_rate": 3.0752827808047446e-06, "loss": 0.0048, "num_tokens": 864372003.0, "reward": 0.033203125, "reward_std": 0.025773944333195686, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.265625, "rewards/tag_count_reward/std": 0.21062465012073517, "step": 1967 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7699907076832787, "frac_reward_zero_std": 0.0, "grad_norm": 0.7613884542249577, "kl": 0.4736328125, "learning_rate": 3.065434955062011e-06, "loss": 0.0047, "num_tokens": 864927411.0, "reward": 0.033203125, "reward_std": 0.02654825709760189, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.265625, "rewards/tag_count_reward/std": 0.21861866116523743, "step": 1968 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7703819631241747, "frac_reward_zero_std": 0.0, "grad_norm": 0.5866323446594919, "kl": 0.466064453125, "learning_rate": 3.055600067202652e-06, "loss": 0.0047, "num_tokens": 865483635.0, "reward": 0.032958984375, "reward_std": 0.02635674551129341, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.263671875, "rewards/tag_count_reward/std": 0.21536242961883545, "step": 1969 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7707732185650706, "frac_reward_zero_std": 0.0, "grad_norm": 0.6071377222992436, "kl": 0.470458984375, "learning_rate": 3.0457781355757055e-06, "loss": 0.0047, "num_tokens": 866037283.0, "reward": 0.0350341796875, "reward_std": 0.02775176241993904, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2802734375, "rewards/tag_count_reward/std": 0.22861802577972412, "step": 1970 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7711644740059667, "frac_reward_zero_std": 0.0, "grad_norm": 0.6348296157897659, "kl": 0.479248046875, "learning_rate": 3.0359691785060484e-06, "loss": 0.0048, "num_tokens": 866590771.0, "reward": 0.036376953125, "reward_std": 0.025486111640930176, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.291015625, "rewards/tag_count_reward/std": 0.20716892182826996, "step": 1971 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7715557294468626, "frac_reward_zero_std": 0.0, "grad_norm": 0.6913926684058548, "kl": 0.47705078125, "learning_rate": 3.0261732142943435e-06, "loss": 0.0048, "num_tokens": 867144531.0, "reward": 0.039794921875, "reward_std": 0.02964971214532852, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.318359375, "rewards/tag_count_reward/std": 0.2399241030216217, "step": 1972 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7719469848877586, "frac_reward_zero_std": 0.0, "grad_norm": 0.8536569209165216, "kl": 0.484619140625, "learning_rate": 3.0163902612170083e-06, "loss": 0.0048, "num_tokens": 867699555.0, "reward": 0.0367431640625, "reward_std": 0.026531707495450974, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2939453125, "rewards/tag_count_reward/std": 0.2152801752090454, "step": 1973 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7723382403286546, "frac_reward_zero_std": 0.0, "grad_norm": 0.5039720941984255, "kl": 0.46533203125, "learning_rate": 3.0066203375262003e-06, "loss": 0.0047, "num_tokens": 868251043.0, "reward": 0.0391845703125, "reward_std": 0.02947092428803444, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3134765625, "rewards/tag_count_reward/std": 0.237684965133667, "step": 1974 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7727294957695505, "frac_reward_zero_std": 0.0, "grad_norm": 0.7252294129039414, "kl": 0.454345703125, "learning_rate": 2.9968634614497495e-06, "loss": 0.0045, "num_tokens": 868807139.0, "reward": 0.0362548828125, "reward_std": 0.027655985206365585, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2900390625, "rewards/tag_count_reward/std": 0.22274667024612427, "step": 1975 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7731207512104465, "frac_reward_zero_std": 0.0, "grad_norm": 0.8612124068738229, "kl": 0.455322265625, "learning_rate": 2.9871196511911503e-06, "loss": 0.0045, "num_tokens": 869359795.0, "reward": 0.04150390625, "reward_std": 0.029799509793519974, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.24072882533073425, "step": 1976 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7735120066513425, "frac_reward_zero_std": 0.0, "grad_norm": 0.6223339756796409, "kl": 0.476806640625, "learning_rate": 2.9773889249295294e-06, "loss": 0.0048, "num_tokens": 869912563.0, "reward": 0.041015625, "reward_std": 0.03128008544445038, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.25581473112106323, "step": 1977 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7739032620922385, "frac_reward_zero_std": 0.0, "grad_norm": 0.6615682273742799, "kl": 0.483642578125, "learning_rate": 2.96767130081959e-06, "loss": 0.0048, "num_tokens": 870466627.0, "reward": 0.0428466796875, "reward_std": 0.02906278520822525, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.23832859098911285, "step": 1978 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7742945175331345, "frac_reward_zero_std": 0.0, "grad_norm": 0.6962342220328769, "kl": 0.48583984375, "learning_rate": 2.957966796991593e-06, "loss": 0.0049, "num_tokens": 871019219.0, "reward": 0.04345703125, "reward_std": 0.028602004051208496, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.22952289879322052, "step": 1979 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7746857729740304, "frac_reward_zero_std": 0.0, "grad_norm": 0.7004155938288572, "kl": 0.489013671875, "learning_rate": 2.9482754315513305e-06, "loss": 0.0049, "num_tokens": 871574787.0, "reward": 0.0439453125, "reward_std": 0.029275331646203995, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.23313191533088684, "step": 1980 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7750770284149264, "frac_reward_zero_std": 0.0, "grad_norm": 0.7144236735522924, "kl": 0.496337890625, "learning_rate": 2.938597222580063e-06, "loss": 0.005, "num_tokens": 872128931.0, "reward": 0.0423583984375, "reward_std": 0.029157068580389023, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3388671875, "rewards/tag_count_reward/std": 0.23879413306713104, "step": 1981 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7754682838558223, "frac_reward_zero_std": 0.0, "grad_norm": 0.6949781585847377, "kl": 0.490966796875, "learning_rate": 2.9289321881345257e-06, "loss": 0.0049, "num_tokens": 872683139.0, "reward": 0.045166015625, "reward_std": 0.029974717646837234, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2411341667175293, "step": 1982 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7758595392967184, "frac_reward_zero_std": 0.0, "grad_norm": 0.8881574795744979, "kl": 0.492431640625, "learning_rate": 2.919280346246861e-06, "loss": 0.0049, "num_tokens": 873236787.0, "reward": 0.0439453125, "reward_std": 0.028339773416519165, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.23522518575191498, "step": 1983 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7762507947376143, "frac_reward_zero_std": 0.0, "grad_norm": 0.669513676922208, "kl": 0.498291015625, "learning_rate": 2.909641714924597e-06, "loss": 0.005, "num_tokens": 873789699.0, "reward": 0.0433349609375, "reward_std": 0.028985004872083664, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.23363855481147766, "step": 1984 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7766420501785103, "frac_reward_zero_std": 0.0, "grad_norm": 1.0559170801460234, "kl": 0.49658203125, "learning_rate": 2.900016312150631e-06, "loss": 0.005, "num_tokens": 874343843.0, "reward": 0.0379638671875, "reward_std": 0.027361854910850525, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3037109375, "rewards/tag_count_reward/std": 0.22315892577171326, "step": 1985 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7770333056194063, "frac_reward_zero_std": 0.0, "grad_norm": 0.7340088197309699, "kl": 0.494384765625, "learning_rate": 2.8904041558831532e-06, "loss": 0.0049, "num_tokens": 874898355.0, "reward": 0.0364990234375, "reward_std": 0.028899017721414566, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.2919921875, "rewards/tag_count_reward/std": 0.2331462949514389, "step": 1986 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7774245610603022, "frac_reward_zero_std": 0.0, "grad_norm": 0.7331127570520829, "kl": 0.4951171875, "learning_rate": 2.8808052640556637e-06, "loss": 0.005, "num_tokens": 875453507.0, "reward": 0.0401611328125, "reward_std": 0.02789401262998581, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3212890625, "rewards/tag_count_reward/std": 0.22697046399116516, "step": 1987 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7778158165011982, "frac_reward_zero_std": 0.0, "grad_norm": 0.686723091061755, "kl": 0.49169921875, "learning_rate": 2.871219654576903e-06, "loss": 0.0049, "num_tokens": 876006435.0, "reward": 0.0401611328125, "reward_std": 0.028544235974550247, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3212890625, "rewards/tag_count_reward/std": 0.2344076782464981, "step": 1988 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7782070719420942, "frac_reward_zero_std": 0.0, "grad_norm": 0.8067887412035678, "kl": 0.489501953125, "learning_rate": 2.8616473453308303e-06, "loss": 0.0049, "num_tokens": 876560211.0, "reward": 0.0416259765625, "reward_std": 0.02961357682943344, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3330078125, "rewards/tag_count_reward/std": 0.23885826766490936, "step": 1989 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7785983273829902, "frac_reward_zero_std": 0.0, "grad_norm": 0.938931222393111, "kl": 0.482421875, "learning_rate": 2.8520883541765976e-06, "loss": 0.0048, "num_tokens": 877113827.0, "reward": 0.0379638671875, "reward_std": 0.02692343108355999, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3037109375, "rewards/tag_count_reward/std": 0.21983928978443146, "step": 1990 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7789895828238862, "frac_reward_zero_std": 0.0, "grad_norm": 1.152526714846787, "kl": 0.47900390625, "learning_rate": 2.842542698948507e-06, "loss": 0.0048, "num_tokens": 877666995.0, "reward": 0.04345703125, "reward_std": 0.030026447027921677, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.24701032042503357, "step": 1991 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7793808382647821, "frac_reward_zero_std": 0.0, "grad_norm": 0.759300684102559, "kl": 0.46923828125, "learning_rate": 2.8330103974559665e-06, "loss": 0.0047, "num_tokens": 878222419.0, "reward": 0.042724609375, "reward_std": 0.028825294226408005, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.341796875, "rewards/tag_count_reward/std": 0.23404183983802795, "step": 1992 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7797720937056781, "frac_reward_zero_std": 0.0, "grad_norm": 6.138655118656142, "kl": 0.4580078125, "learning_rate": 2.8234914674834913e-06, "loss": 0.0046, "num_tokens": 878776051.0, "reward": 0.0445556640625, "reward_std": 0.031191829591989517, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.2527252733707428, "step": 1993 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.780163349146574, "frac_reward_zero_std": 0.0, "grad_norm": 0.5959449725939634, "kl": 0.456298828125, "learning_rate": 2.8139859267906346e-06, "loss": 0.0046, "num_tokens": 879331459.0, "reward": 0.0400390625, "reward_std": 0.031145654618740082, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3203125, "rewards/tag_count_reward/std": 0.2532872259616852, "step": 1994 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7805546045874701, "frac_reward_zero_std": 0.0, "grad_norm": 0.6560939693005263, "kl": 0.444580078125, "learning_rate": 2.804493793111971e-06, "loss": 0.0044, "num_tokens": 879885603.0, "reward": 0.040283203125, "reward_std": 0.029690947383642197, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.322265625, "rewards/tag_count_reward/std": 0.2356724739074707, "step": 1995 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.780945860028366, "frac_reward_zero_std": 0.0, "grad_norm": 0.6237967854278182, "kl": 0.45166015625, "learning_rate": 2.7950150841570687e-06, "loss": 0.0045, "num_tokens": 880439283.0, "reward": 0.0418701171875, "reward_std": 0.027939297258853912, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3349609375, "rewards/tag_count_reward/std": 0.22764436900615692, "step": 1996 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.781337115469262, "frac_reward_zero_std": 0.0, "grad_norm": 0.6223704999299694, "kl": 0.446533203125, "learning_rate": 2.7855498176104435e-06, "loss": 0.0045, "num_tokens": 880991635.0, "reward": 0.0455322265625, "reward_std": 0.030803054571151733, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.24829187989234924, "step": 1997 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.781728370910158, "frac_reward_zero_std": 0.0, "grad_norm": 0.7571667693017281, "kl": 0.438720703125, "learning_rate": 2.7760980111315307e-06, "loss": 0.0044, "num_tokens": 881543923.0, "reward": 0.043212890625, "reward_std": 0.02969180978834629, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.24176859855651855, "step": 1998 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7821196263510539, "frac_reward_zero_std": 0.0, "grad_norm": 0.7096890826457914, "kl": 0.429931640625, "learning_rate": 2.7666596823546554e-06, "loss": 0.0043, "num_tokens": 882099459.0, "reward": 0.0433349609375, "reward_std": 0.027711046859622, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.22940398752689362, "step": 1999 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7825108817919499, "frac_reward_zero_std": 0.0, "grad_norm": 0.634746852201608, "kl": 0.421875, "learning_rate": 2.757234848888993e-06, "loss": 0.0042, "num_tokens": 882655795.0, "reward": 0.0411376953125, "reward_std": 0.028335511684417725, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3291015625, "rewards/tag_count_reward/std": 0.23081861436367035, "step": 2000 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7829021372328459, "frac_reward_zero_std": 0.0, "grad_norm": 0.6687567385291985, "kl": 0.422607421875, "learning_rate": 2.7478235283185504e-06, "loss": 0.0042, "num_tokens": 883211475.0, "reward": 0.0413818359375, "reward_std": 0.0284347552806139, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3310546875, "rewards/tag_count_reward/std": 0.22906987369060516, "step": 2001 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7832933926737419, "frac_reward_zero_std": 0.0, "grad_norm": 0.6676229600365733, "kl": 0.41064453125, "learning_rate": 2.7384257382021185e-06, "loss": 0.0041, "num_tokens": 883764243.0, "reward": 0.04248046875, "reward_std": 0.028536302968859673, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.22521096467971802, "step": 2002 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7836846481146379, "frac_reward_zero_std": 0.0, "grad_norm": 0.5393725311797078, "kl": 0.423095703125, "learning_rate": 2.7290414960732313e-06, "loss": 0.0042, "num_tokens": 884315987.0, "reward": 0.0450439453125, "reward_std": 0.027184048667550087, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.2220233976840973, "step": 2003 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7840759035555338, "frac_reward_zero_std": 0.0, "grad_norm": 0.6747798159038741, "kl": 0.4267578125, "learning_rate": 2.719670819440169e-06, "loss": 0.0043, "num_tokens": 884870883.0, "reward": 0.0408935546875, "reward_std": 0.026574168354272842, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3271484375, "rewards/tag_count_reward/std": 0.2172810435295105, "step": 2004 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7844671589964298, "frac_reward_zero_std": 0.0, "grad_norm": 0.6764616667438746, "kl": 0.425537109375, "learning_rate": 2.7103137257858867e-06, "loss": 0.0043, "num_tokens": 885424403.0, "reward": 0.0430908203125, "reward_std": 0.028123702853918076, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.22700420022010803, "step": 2005 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7848584144373257, "frac_reward_zero_std": 0.0, "grad_norm": 0.6263648862216283, "kl": 0.427490234375, "learning_rate": 2.700970232568001e-06, "loss": 0.0043, "num_tokens": 885978627.0, "reward": 0.046875, "reward_std": 0.029901225119829178, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24753689765930176, "step": 2006 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7852496698782218, "frac_reward_zero_std": 0.0, "grad_norm": 0.5589080859043182, "kl": 0.415283203125, "learning_rate": 2.691640357218759e-06, "loss": 0.0042, "num_tokens": 886533027.0, "reward": 0.044189453125, "reward_std": 0.029713209718465805, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.24359913170337677, "step": 2007 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7856409253191177, "frac_reward_zero_std": 0.0, "grad_norm": 0.6445950736964623, "kl": 0.434326171875, "learning_rate": 2.6823241171449966e-06, "loss": 0.0043, "num_tokens": 887087235.0, "reward": 0.040283203125, "reward_std": 0.030157487839460373, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.322265625, "rewards/tag_count_reward/std": 0.24783839285373688, "step": 2008 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7860321807600137, "frac_reward_zero_std": 0.0, "grad_norm": 0.6655317178824487, "kl": 0.439697265625, "learning_rate": 2.673021529728108e-06, "loss": 0.0044, "num_tokens": 887640723.0, "reward": 0.043212890625, "reward_std": 0.030673181638121605, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.2477765679359436, "step": 2009 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7864234362009097, "frac_reward_zero_std": 0.0, "grad_norm": 0.9443723340204166, "kl": 0.44482421875, "learning_rate": 2.6637326123240216e-06, "loss": 0.0044, "num_tokens": 888196451.0, "reward": 0.040283203125, "reward_std": 0.03323676064610481, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.322265625, "rewards/tag_count_reward/std": 0.265042245388031, "step": 2010 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7868146916418056, "frac_reward_zero_std": 0.0, "grad_norm": 0.7399952503219482, "kl": 0.44580078125, "learning_rate": 2.6544573822631523e-06, "loss": 0.0045, "num_tokens": 888751795.0, "reward": 0.0386962890625, "reward_std": 0.027586933225393295, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3095703125, "rewards/tag_count_reward/std": 0.22055236995220184, "step": 2011 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7872059470827016, "frac_reward_zero_std": 0.0, "grad_norm": 0.6674544626844494, "kl": 0.458984375, "learning_rate": 2.645195856850391e-06, "loss": 0.0046, "num_tokens": 889306627.0, "reward": 0.04296875, "reward_std": 0.03223186731338501, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.2619684338569641, "step": 2012 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7875972025235976, "frac_reward_zero_std": 0.0, "grad_norm": 0.7607055744322534, "kl": 0.45751953125, "learning_rate": 2.6359480533650506e-06, "loss": 0.0046, "num_tokens": 889860227.0, "reward": 0.0435791015625, "reward_std": 0.03023010492324829, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.24210694432258606, "step": 2013 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7879884579644936, "frac_reward_zero_std": 0.0, "grad_norm": 0.8905758544583906, "kl": 0.460205078125, "learning_rate": 2.626713989060845e-06, "loss": 0.0046, "num_tokens": 890414931.0, "reward": 0.0411376953125, "reward_std": 0.029780978336930275, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3291015625, "rewards/tag_count_reward/std": 0.24120363593101501, "step": 2014 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7883797134053896, "frac_reward_zero_std": 0.0, "grad_norm": 0.7375190589427283, "kl": 0.447265625, "learning_rate": 2.6174936811658547e-06, "loss": 0.0045, "num_tokens": 890968595.0, "reward": 0.0433349609375, "reward_std": 0.028714168816804886, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.2367647886276245, "step": 2015 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7887709688462855, "frac_reward_zero_std": 0.0, "grad_norm": 0.6436184125063011, "kl": 0.447265625, "learning_rate": 2.608287146882492e-06, "loss": 0.0045, "num_tokens": 891521603.0, "reward": 0.0440673828125, "reward_std": 0.028423871845006943, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.23427695035934448, "step": 2016 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7891622242871815, "frac_reward_zero_std": 0.0, "grad_norm": 0.748365162877499, "kl": 0.4375, "learning_rate": 2.599094403387481e-06, "loss": 0.0044, "num_tokens": 892073891.0, "reward": 0.039794921875, "reward_std": 0.03078661859035492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.318359375, "rewards/tag_count_reward/std": 0.24697156250476837, "step": 2017 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7895534797280774, "frac_reward_zero_std": 0.0, "grad_norm": 0.8638397410402681, "kl": 0.43994140625, "learning_rate": 2.5899154678318074e-06, "loss": 0.0044, "num_tokens": 892627059.0, "reward": 0.040283203125, "reward_std": 0.030012449249625206, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.322265625, "rewards/tag_count_reward/std": 0.24385054409503937, "step": 2018 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7899447351689735, "frac_reward_zero_std": 0.0, "grad_norm": 0.7118042610730891, "kl": 0.44091796875, "learning_rate": 2.5807503573406967e-06, "loss": 0.0044, "num_tokens": 893180195.0, "reward": 0.0404052734375, "reward_std": 0.02788589522242546, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.23063603043556213, "step": 2019 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7903359906098694, "frac_reward_zero_std": 0.0, "grad_norm": 0.7263051533547766, "kl": 0.441650390625, "learning_rate": 2.5715990890135822e-06, "loss": 0.0044, "num_tokens": 893732579.0, "reward": 0.0386962890625, "reward_std": 0.02886117994785309, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3095703125, "rewards/tag_count_reward/std": 0.23663534224033356, "step": 2020 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7907272460507654, "frac_reward_zero_std": 0.0, "grad_norm": 0.7110573719636284, "kl": 0.437744140625, "learning_rate": 2.562461679924072e-06, "loss": 0.0044, "num_tokens": 894287507.0, "reward": 0.040771484375, "reward_std": 0.029041841626167297, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.326171875, "rewards/tag_count_reward/std": 0.239604651927948, "step": 2021 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7911185014916614, "frac_reward_zero_std": 0.0, "grad_norm": 0.6856064696112117, "kl": 0.4541015625, "learning_rate": 2.5533381471199138e-06, "loss": 0.0045, "num_tokens": 894841763.0, "reward": 0.0428466796875, "reward_std": 0.028387684375047684, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.22995422780513763, "step": 2022 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7915097569325573, "frac_reward_zero_std": 0.0, "grad_norm": 0.7509266456079013, "kl": 0.44970703125, "learning_rate": 2.5442285076229723e-06, "loss": 0.0045, "num_tokens": 895394915.0, "reward": 0.039306640625, "reward_std": 0.027376454323530197, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.314453125, "rewards/tag_count_reward/std": 0.22632203996181488, "step": 2023 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7919010123734533, "frac_reward_zero_std": 0.0, "grad_norm": 0.7088063267848417, "kl": 0.452392578125, "learning_rate": 2.535132778429188e-06, "loss": 0.0045, "num_tokens": 895948915.0, "reward": 0.0430908203125, "reward_std": 0.030250439420342445, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.2466670572757721, "step": 2024 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7922922678143492, "frac_reward_zero_std": 0.0, "grad_norm": 0.5838166518980171, "kl": 0.4609375, "learning_rate": 2.5260509765085474e-06, "loss": 0.0046, "num_tokens": 896505059.0, "reward": 0.0430908203125, "reward_std": 0.027202559635043144, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.21931606531143188, "step": 2025 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7926835232552453, "frac_reward_zero_std": 0.0, "grad_norm": 0.7830302590184948, "kl": 0.465087890625, "learning_rate": 2.516983118805053e-06, "loss": 0.0047, "num_tokens": 897060035.0, "reward": 0.03857421875, "reward_std": 0.03012879379093647, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.30859375, "rewards/tag_count_reward/std": 0.2425040453672409, "step": 2026 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 205.0, "completions/mean_length": 2040.80078125, "completions/mean_terminated_length": 205.0, "completions/min_length": 205.0, "completions/min_terminated_length": 205.0, "epoch": 0.7930747786961413, "frac_reward_zero_std": 0.0, "grad_norm": 0.8676823555945105, "kl": 0.458740234375, "learning_rate": 2.5079292222366903e-06, "loss": -0.0037, "num_tokens": 897611072.0, "reward": 0.0457763671875, "reward_std": 0.030784502625465393, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.2452189177274704, "step": 2027 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7934660341370372, "frac_reward_zero_std": 0.0, "grad_norm": 0.6935718641420232, "kl": 0.4638671875, "learning_rate": 2.4988893036954045e-06, "loss": 0.0046, "num_tokens": 898166528.0, "reward": 0.0423583984375, "reward_std": 0.03130429983139038, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3388671875, "rewards/tag_count_reward/std": 0.24982953071594238, "step": 2028 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7938572895779332, "frac_reward_zero_std": 0.0, "grad_norm": 0.803397669012043, "kl": 0.47607421875, "learning_rate": 2.489863380047055e-06, "loss": 0.0048, "num_tokens": 898720480.0, "reward": 0.045654296875, "reward_std": 0.03111589513719082, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.2532191872596741, "step": 2029 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7942485450188291, "frac_reward_zero_std": 0.0, "grad_norm": 0.6715916840358321, "kl": 0.456787109375, "learning_rate": 2.4808514681313934e-06, "loss": 0.0046, "num_tokens": 899274928.0, "reward": 0.043212890625, "reward_std": 0.029611581936478615, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.23767893016338348, "step": 2030 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7946398004597252, "frac_reward_zero_std": 0.0, "grad_norm": 0.6804636822021857, "kl": 0.435791015625, "learning_rate": 2.471853584762026e-06, "loss": 0.0044, "num_tokens": 899828384.0, "reward": 0.047119140625, "reward_std": 0.02976783737540245, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.24252773821353912, "step": 2031 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7950310559006211, "frac_reward_zero_std": 0.0, "grad_norm": 0.8068152839533375, "kl": 0.431396484375, "learning_rate": 2.4628697467263916e-06, "loss": 0.0043, "num_tokens": 900383312.0, "reward": 0.0435791015625, "reward_std": 0.026997273787856102, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.22314175963401794, "step": 2032 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7954223113415171, "frac_reward_zero_std": 0.0, "grad_norm": 0.8208435984577362, "kl": 0.445068359375, "learning_rate": 2.453899970785716e-06, "loss": 0.0045, "num_tokens": 900937328.0, "reward": 0.042236328125, "reward_std": 0.029597287997603416, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.23449955880641937, "step": 2033 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7958135667824131, "frac_reward_zero_std": 0.0, "grad_norm": 0.7219244351990157, "kl": 0.42333984375, "learning_rate": 2.4449442736750027e-06, "loss": 0.0042, "num_tokens": 901490432.0, "reward": 0.04248046875, "reward_std": 0.029617488384246826, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.23791244626045227, "step": 2034 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.796204822223309, "frac_reward_zero_std": 0.0, "grad_norm": 0.5945728471096581, "kl": 0.41796875, "learning_rate": 2.436002672102975e-06, "loss": 0.0042, "num_tokens": 902044624.0, "reward": 0.0406494140625, "reward_std": 0.02932111732661724, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3251953125, "rewards/tag_count_reward/std": 0.23317915201187134, "step": 2035 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.796596077664205, "frac_reward_zero_std": 0.0, "grad_norm": 0.676621633231981, "kl": 0.41162109375, "learning_rate": 2.4270751827520657e-06, "loss": 0.0041, "num_tokens": 902598592.0, "reward": 0.04443359375, "reward_std": 0.027725424617528915, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.2238464504480362, "step": 2036 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.796987333105101, "frac_reward_zero_std": 0.0, "grad_norm": 0.6334238044462864, "kl": 0.403564453125, "learning_rate": 2.418161822278374e-06, "loss": 0.004, "num_tokens": 903152176.0, "reward": 0.0467529296875, "reward_std": 0.030422812327742577, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.24554666876792908, "step": 2037 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.797378588545997, "frac_reward_zero_std": 0.0, "grad_norm": 0.6001182130896697, "kl": 0.41357421875, "learning_rate": 2.40926260731164e-06, "loss": 0.0041, "num_tokens": 903706624.0, "reward": 0.0457763671875, "reward_std": 0.030463188886642456, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24338483810424805, "step": 2038 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.797769843986893, "frac_reward_zero_std": 0.0, "grad_norm": 0.7017277409133557, "kl": 0.40478515625, "learning_rate": 2.4003775544552154e-06, "loss": 0.004, "num_tokens": 904260256.0, "reward": 0.0447998046875, "reward_std": 0.030977237969636917, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.24796777963638306, "step": 2039 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7981610994277889, "frac_reward_zero_std": 0.0, "grad_norm": 0.5842288557329949, "kl": 0.40185546875, "learning_rate": 2.3915066802860265e-06, "loss": 0.004, "num_tokens": 904815312.0, "reward": 0.0399169921875, "reward_std": 0.028200779110193253, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3193359375, "rewards/tag_count_reward/std": 0.22972095012664795, "step": 2040 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7985523548686849, "frac_reward_zero_std": 0.0, "grad_norm": 0.7226166849293189, "kl": 0.394287109375, "learning_rate": 2.382650001354543e-06, "loss": 0.0039, "num_tokens": 905368736.0, "reward": 0.0384521484375, "reward_std": 0.02959529310464859, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3076171875, "rewards/tag_count_reward/std": 0.23815175890922546, "step": 2041 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7989436103095808, "frac_reward_zero_std": 0.0, "grad_norm": 0.7263743102863874, "kl": 0.392822265625, "learning_rate": 2.3738075341847634e-06, "loss": 0.0039, "num_tokens": 905924272.0, "reward": 0.04150390625, "reward_std": 0.030153803527355194, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33203125, "rewards/tag_count_reward/std": 0.24576686322689056, "step": 2042 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7993348657504769, "frac_reward_zero_std": 0.0, "grad_norm": 0.6297463476034524, "kl": 0.402099609375, "learning_rate": 2.3649792952741513e-06, "loss": 0.004, "num_tokens": 906476960.0, "reward": 0.0433349609375, "reward_std": 0.031603142619132996, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.24986018240451813, "step": 2043 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.7997261211913728, "frac_reward_zero_std": 0.0, "grad_norm": 0.8257132024042217, "kl": 0.39453125, "learning_rate": 2.3561653010936414e-06, "loss": 0.0039, "num_tokens": 907031568.0, "reward": 0.0447998046875, "reward_std": 0.02658480405807495, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.2174220085144043, "step": 2044 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8001173766322688, "frac_reward_zero_std": 0.0, "grad_norm": 0.6860495999967466, "kl": 0.4033203125, "learning_rate": 2.3473655680875864e-06, "loss": 0.004, "num_tokens": 907586304.0, "reward": 0.0438232421875, "reward_std": 0.030508557334542274, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.24632525444030762, "step": 2045 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8005086320731648, "frac_reward_zero_std": 0.0, "grad_norm": 0.8536212204865451, "kl": 0.415771484375, "learning_rate": 2.338580112673725e-06, "loss": 0.0042, "num_tokens": 908138176.0, "reward": 0.0450439453125, "reward_std": 0.02965252473950386, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.23903457820415497, "step": 2046 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8008998875140607, "frac_reward_zero_std": 0.0, "grad_norm": 1.0993225403909277, "kl": 0.428466796875, "learning_rate": 2.3298089512431744e-06, "loss": 0.0043, "num_tokens": 908690656.0, "reward": 0.0426025390625, "reward_std": 0.028146125376224518, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3408203125, "rewards/tag_count_reward/std": 0.23494620621204376, "step": 2047 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8012911429549567, "frac_reward_zero_std": 0.0, "grad_norm": 0.6993036886571147, "kl": 0.434326171875, "learning_rate": 2.3210521001603637e-06, "loss": 0.0043, "num_tokens": 909244208.0, "reward": 0.0404052734375, "reward_std": 0.026606444269418716, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.2208646982908249, "step": 2048 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8016823983958526, "frac_reward_zero_std": 0.0, "grad_norm": 0.7350590740620101, "kl": 0.439208984375, "learning_rate": 2.3123095757630344e-06, "loss": 0.0044, "num_tokens": 909797984.0, "reward": 0.045166015625, "reward_std": 0.02994152344763279, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2411341667175293, "step": 2049 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8020736538367487, "frac_reward_zero_std": 0.0, "grad_norm": 0.7879045552140572, "kl": 0.436279296875, "learning_rate": 2.3035813943621997e-06, "loss": 0.0044, "num_tokens": 910354064.0, "reward": 0.041259765625, "reward_std": 0.02682347409427166, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.330078125, "rewards/tag_count_reward/std": 0.21678034961223602, "step": 2050 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8024649092776447, "frac_reward_zero_std": 0.0, "grad_norm": 0.7562579186242574, "kl": 0.45361328125, "learning_rate": 2.2948675722421086e-06, "loss": 0.0045, "num_tokens": 910908640.0, "reward": 0.0443115234375, "reward_std": 0.029118601232767105, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3544921875, "rewards/tag_count_reward/std": 0.23860159516334534, "step": 2051 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8028561647185406, "frac_reward_zero_std": 0.0, "grad_norm": 0.8971564119081452, "kl": 0.46728515625, "learning_rate": 2.2861681256602187e-06, "loss": 0.0047, "num_tokens": 911461600.0, "reward": 0.0440673828125, "reward_std": 0.02931841090321541, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.23945076763629913, "step": 2052 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8032474201594366, "frac_reward_zero_std": 0.0, "grad_norm": 0.8045766078738974, "kl": 0.46826171875, "learning_rate": 2.2774830708471774e-06, "loss": 0.0047, "num_tokens": 912017216.0, "reward": 0.0458984375, "reward_std": 0.027947209775447845, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.22565264999866486, "step": 2053 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8036386756003325, "frac_reward_zero_std": 0.0, "grad_norm": 0.9366336962414069, "kl": 0.47314453125, "learning_rate": 2.2688124240067622e-06, "loss": 0.0047, "num_tokens": 912570224.0, "reward": 0.04736328125, "reward_std": 0.027532363310456276, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.22898834943771362, "step": 2054 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8040299310412286, "frac_reward_zero_std": 0.0, "grad_norm": 0.9206805554465611, "kl": 0.47119140625, "learning_rate": 2.2601562013158897e-06, "loss": 0.0047, "num_tokens": 913123056.0, "reward": 0.04638671875, "reward_std": 0.02992488071322441, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.24451708793640137, "step": 2055 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8044211864821245, "frac_reward_zero_std": 0.0, "grad_norm": 1.1255244724529085, "kl": 0.47802734375, "learning_rate": 2.251514418924551e-06, "loss": 0.0048, "num_tokens": 913676400.0, "reward": 0.042236328125, "reward_std": 0.027849683538079262, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.22921401262283325, "step": 2056 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8048124419230205, "frac_reward_zero_std": 0.0, "grad_norm": 0.7638857034446982, "kl": 0.461181640625, "learning_rate": 2.2428870929558012e-06, "loss": 0.0046, "num_tokens": 914230912.0, "reward": 0.041259765625, "reward_std": 0.02740607038140297, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.330078125, "rewards/tag_count_reward/std": 0.22014610469341278, "step": 2057 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8052036973639165, "frac_reward_zero_std": 0.0, "grad_norm": 1.1379005990989903, "kl": 0.4638671875, "learning_rate": 2.234274239505727e-06, "loss": 0.0046, "num_tokens": 914785888.0, "reward": 0.0435791015625, "reward_std": 0.028262391686439514, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.22856776416301727, "step": 2058 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8055949528048124, "frac_reward_zero_std": 0.0, "grad_norm": 0.7799139312480288, "kl": 0.468994140625, "learning_rate": 2.2256758746434114e-06, "loss": 0.0047, "num_tokens": 915339776.0, "reward": 0.042724609375, "reward_std": 0.029764549806714058, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.341796875, "rewards/tag_count_reward/std": 0.2402431219816208, "step": 2059 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8059862082457084, "frac_reward_zero_std": 0.0, "grad_norm": 0.6982987881177125, "kl": 0.4580078125, "learning_rate": 2.2170920144108965e-06, "loss": 0.0046, "num_tokens": 915893952.0, "reward": 0.0467529296875, "reward_std": 0.03034541755914688, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2465428113937378, "step": 2060 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8063774636866043, "frac_reward_zero_std": 0.0, "grad_norm": 0.6038728746984567, "kl": 0.45068359375, "learning_rate": 2.2085226748231792e-06, "loss": 0.0045, "num_tokens": 916445984.0, "reward": 0.04443359375, "reward_std": 0.030027836561203003, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.24072882533073425, "step": 2061 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8067687191275004, "frac_reward_zero_std": 0.0, "grad_norm": 0.7984468203638745, "kl": 0.442626953125, "learning_rate": 2.199967871868154e-06, "loss": 0.0044, "num_tokens": 916998800.0, "reward": 0.043212890625, "reward_std": 0.029875677078962326, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.23870791494846344, "step": 2062 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8071599745683964, "frac_reward_zero_std": 0.0, "grad_norm": 0.8208818110571799, "kl": 0.4423828125, "learning_rate": 2.1914276215065956e-06, "loss": 0.0044, "num_tokens": 917551520.0, "reward": 0.043701171875, "reward_std": 0.03084094263613224, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.2501761019229889, "step": 2063 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8075512300092923, "frac_reward_zero_std": 0.0, "grad_norm": 0.5876848113304058, "kl": 0.431640625, "learning_rate": 2.1829019396721374e-06, "loss": 0.0043, "num_tokens": 918106368.0, "reward": 0.0408935546875, "reward_std": 0.02823980338871479, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3271484375, "rewards/tag_count_reward/std": 0.23253773152828217, "step": 2064 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8079424854501883, "frac_reward_zero_std": 0.0, "grad_norm": 0.9152956786479124, "kl": 0.4384765625, "learning_rate": 2.1743908422712135e-06, "loss": 0.0044, "num_tokens": 918659216.0, "reward": 0.0460205078125, "reward_std": 0.029126105830073357, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.23629525303840637, "step": 2065 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8083337408910842, "frac_reward_zero_std": 0.0, "grad_norm": 0.6729919235072082, "kl": 0.42822265625, "learning_rate": 2.1658943451830663e-06, "loss": 0.0043, "num_tokens": 919211264.0, "reward": 0.0450439453125, "reward_std": 0.031066369265317917, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.2500593662261963, "step": 2066 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8087249963319803, "frac_reward_zero_std": 0.0, "grad_norm": 2.2291020737438823, "kl": 0.422607421875, "learning_rate": 2.1574124642596882e-06, "loss": 0.0042, "num_tokens": 919764944.0, "reward": 0.0458984375, "reward_std": 0.029615754261612892, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.23833060264587402, "step": 2067 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8091162517728763, "frac_reward_zero_std": 0.0, "grad_norm": 0.6963416376821024, "kl": 0.42822265625, "learning_rate": 2.1489452153258016e-06, "loss": 0.0043, "num_tokens": 920317312.0, "reward": 0.0430908203125, "reward_std": 0.028798941522836685, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.23755604028701782, "step": 2068 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8095075072137722, "frac_reward_zero_std": 0.0, "grad_norm": 0.5710824029719164, "kl": 0.423095703125, "learning_rate": 2.1404926141788385e-06, "loss": 0.0042, "num_tokens": 920868848.0, "reward": 0.0419921875, "reward_std": 0.028903376311063766, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3359375, "rewards/tag_count_reward/std": 0.23101969063282013, "step": 2069 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8098987626546682, "frac_reward_zero_std": 0.0, "grad_norm": 0.5677937033486329, "kl": 0.42431640625, "learning_rate": 2.1320546765888927e-06, "loss": 0.0042, "num_tokens": 921420640.0, "reward": 0.044677734375, "reward_std": 0.02954207733273506, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.23677489161491394, "step": 2070 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8102900180955641, "frac_reward_zero_std": 0.0, "grad_norm": 0.7169029688897716, "kl": 0.416015625, "learning_rate": 2.123631418298705e-06, "loss": 0.0042, "num_tokens": 921975728.0, "reward": 0.041748046875, "reward_std": 0.030251307412981987, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.333984375, "rewards/tag_count_reward/std": 0.2430955320596695, "step": 2071 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8106812735364601, "frac_reward_zero_std": 0.0, "grad_norm": 0.7046860300949399, "kl": 0.41015625, "learning_rate": 2.1152228550236264e-06, "loss": 0.0041, "num_tokens": 922531392.0, "reward": 0.0426025390625, "reward_std": 0.02841849997639656, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3408203125, "rewards/tag_count_reward/std": 0.2296709418296814, "step": 2072 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.811072528977356, "frac_reward_zero_std": 0.0, "grad_norm": 0.7160251139174668, "kl": 0.417236328125, "learning_rate": 2.1068290024515925e-06, "loss": 0.0042, "num_tokens": 923085536.0, "reward": 0.0394287109375, "reward_std": 0.02946256659924984, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3154296875, "rewards/tag_count_reward/std": 0.23715266585350037, "step": 2073 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8114637844182521, "frac_reward_zero_std": 0.0, "grad_norm": 0.6743180245191037, "kl": 0.421875, "learning_rate": 2.098449876243096e-06, "loss": 0.0042, "num_tokens": 923639504.0, "reward": 0.042724609375, "reward_std": 0.02845441736280918, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.341796875, "rewards/tag_count_reward/std": 0.23087875545024872, "step": 2074 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8118550398591481, "frac_reward_zero_std": 0.0, "grad_norm": 0.7744747077654335, "kl": 0.418701171875, "learning_rate": 2.090085492031151e-06, "loss": 0.0042, "num_tokens": 924192000.0, "reward": 0.044677734375, "reward_std": 0.030698347836732864, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.25664424896240234, "step": 2075 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.812246295300044, "frac_reward_zero_std": 0.0, "grad_norm": 0.6529981759899606, "kl": 0.4189453125, "learning_rate": 2.0817358654212662e-06, "loss": 0.0042, "num_tokens": 924745808.0, "reward": 0.04345703125, "reward_std": 0.029289616271853447, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.2440153807401657, "step": 2076 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.81263755074094, "frac_reward_zero_std": 0.0, "grad_norm": 0.6861752919193732, "kl": 0.42138671875, "learning_rate": 2.0734010119914193e-06, "loss": 0.0042, "num_tokens": 925298944.0, "reward": 0.044677734375, "reward_std": 0.030824821442365646, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.24986976385116577, "step": 2077 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8130288061818359, "frac_reward_zero_std": 0.0, "grad_norm": 0.6004345319441764, "kl": 0.42236328125, "learning_rate": 2.065080947292025e-06, "loss": 0.0042, "num_tokens": 925852672.0, "reward": 0.0439453125, "reward_std": 0.028314780443906784, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.22888797521591187, "step": 2078 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.813420061622732, "frac_reward_zero_std": 0.0, "grad_norm": 0.692615948536971, "kl": 0.413818359375, "learning_rate": 2.0567756868459043e-06, "loss": 0.0041, "num_tokens": 926406272.0, "reward": 0.048828125, "reward_std": 0.028399527072906494, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.2284860759973526, "step": 2079 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.813811317063628, "frac_reward_zero_std": 0.0, "grad_norm": 0.5857183550361594, "kl": 0.42138671875, "learning_rate": 2.0484852461482642e-06, "loss": 0.0042, "num_tokens": 926959664.0, "reward": 0.0438232421875, "reward_std": 0.03109116293489933, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.24731826782226562, "step": 2080 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8142025725045239, "frac_reward_zero_std": 0.0, "grad_norm": 0.6069844105493958, "kl": 0.4189453125, "learning_rate": 2.0402096406666562e-06, "loss": 0.0042, "num_tokens": 927513936.0, "reward": 0.0399169921875, "reward_std": 0.027537869289517403, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3193359375, "rewards/tag_count_reward/std": 0.2199089676141739, "step": 2081 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8145938279454199, "frac_reward_zero_std": 0.0, "grad_norm": 0.686628506441597, "kl": 0.42333984375, "learning_rate": 2.0319488858409552e-06, "loss": 0.0042, "num_tokens": 928067216.0, "reward": 0.043212890625, "reward_std": 0.028286162763834, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.2345648854970932, "step": 2082 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8149850833863158, "frac_reward_zero_std": 0.0, "grad_norm": 0.6382776965549459, "kl": 0.4111328125, "learning_rate": 2.023702997083329e-06, "loss": 0.0041, "num_tokens": 928621520.0, "reward": 0.044189453125, "reward_std": 0.02713417261838913, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.22908031940460205, "step": 2083 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8153763388272118, "frac_reward_zero_std": 0.0, "grad_norm": 0.6250996021994698, "kl": 0.416259765625, "learning_rate": 2.015471989778208e-06, "loss": 0.0042, "num_tokens": 929175248.0, "reward": 0.043701171875, "reward_std": 0.029271956533193588, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.23709815740585327, "step": 2084 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8157675942681077, "frac_reward_zero_std": 0.0, "grad_norm": 0.6918951303949367, "kl": 0.416748046875, "learning_rate": 2.007255879282266e-06, "loss": 0.0042, "num_tokens": 929726976.0, "reward": 0.0443115234375, "reward_std": 0.029522784054279327, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.2340807020664215, "step": 2085 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8161588497090038, "frac_reward_zero_std": 0.0, "grad_norm": 0.7616956767718799, "kl": 0.416259765625, "learning_rate": 1.9990546809243725e-06, "loss": 0.0042, "num_tokens": 930280096.0, "reward": 0.0445556640625, "reward_std": 0.02824837900698185, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.23040345311164856, "step": 2086 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8165501051498998, "frac_reward_zero_std": 0.0, "grad_norm": 0.9025013370120997, "kl": 0.43798828125, "learning_rate": 1.9908684100055843e-06, "loss": 0.0044, "num_tokens": 930834416.0, "reward": 0.0447998046875, "reward_std": 0.028723478317260742, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.22948744893074036, "step": 2087 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8169413605907957, "frac_reward_zero_std": 0.0, "grad_norm": 0.792578853075689, "kl": 0.43408203125, "learning_rate": 1.982697081799101e-06, "loss": 0.0043, "num_tokens": 931387872.0, "reward": 0.0423583984375, "reward_std": 0.030870534479618073, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3388671875, "rewards/tag_count_reward/std": 0.2508086860179901, "step": 2088 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8173326160316917, "frac_reward_zero_std": 0.0, "grad_norm": 0.744715443034918, "kl": 0.4306640625, "learning_rate": 1.974540711550248e-06, "loss": 0.0043, "num_tokens": 931939568.0, "reward": 0.040771484375, "reward_std": 0.030016470700502396, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.326171875, "rewards/tag_count_reward/std": 0.24062541127204895, "step": 2089 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8177238714725876, "frac_reward_zero_std": 0.0, "grad_norm": 0.673632712844762, "kl": 0.419921875, "learning_rate": 1.9663993144764393e-06, "loss": 0.0042, "num_tokens": 932496608.0, "reward": 0.04345703125, "reward_std": 0.02891646698117256, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.23688000440597534, "step": 2090 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8181151269134836, "frac_reward_zero_std": 0.0, "grad_norm": 0.6858644518932429, "kl": 0.429931640625, "learning_rate": 1.9582729057671622e-06, "loss": 0.0043, "num_tokens": 933051168.0, "reward": 0.042236328125, "reward_std": 0.02885933592915535, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.23239977657794952, "step": 2091 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8185063823543797, "frac_reward_zero_std": 0.0, "grad_norm": 0.6586675231331138, "kl": 0.4150390625, "learning_rate": 1.95016150058393e-06, "loss": 0.0042, "num_tokens": 933603520.0, "reward": 0.0411376953125, "reward_std": 0.030266255140304565, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3291015625, "rewards/tag_count_reward/std": 0.24423304200172424, "step": 2092 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8188976377952756, "frac_reward_zero_std": 0.0, "grad_norm": 1.0751384879801709, "kl": 0.418701171875, "learning_rate": 1.9420651140602697e-06, "loss": 0.0042, "num_tokens": 934158080.0, "reward": 0.04052734375, "reward_std": 0.03008616715669632, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.32421875, "rewards/tag_count_reward/std": 0.24123737215995789, "step": 2093 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8192888932361716, "frac_reward_zero_std": 0.0, "grad_norm": 0.6224825498083925, "kl": 0.416259765625, "learning_rate": 1.9339837613016842e-06, "loss": 0.0042, "num_tokens": 934714496.0, "reward": 0.0479736328125, "reward_std": 0.02805943787097931, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.22885242104530334, "step": 2094 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8196801486770675, "frac_reward_zero_std": 0.0, "grad_norm": 0.7058305712677256, "kl": 0.418212890625, "learning_rate": 1.9259174573856276e-06, "loss": 0.0042, "num_tokens": 935267984.0, "reward": 0.0423583984375, "reward_std": 0.0278141088783741, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3388671875, "rewards/tag_count_reward/std": 0.22722341120243073, "step": 2095 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8200714041179635, "frac_reward_zero_std": 0.0, "grad_norm": 0.6533522438869726, "kl": 0.4228515625, "learning_rate": 1.917866217361485e-06, "loss": 0.0042, "num_tokens": 935823568.0, "reward": 0.044677734375, "reward_std": 0.029987577348947525, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.24391335248947144, "step": 2096 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8204626595588594, "frac_reward_zero_std": 0.0, "grad_norm": 0.6768859017397626, "kl": 0.419677734375, "learning_rate": 1.9098300562505266e-06, "loss": 0.0042, "num_tokens": 936378720.0, "reward": 0.041748046875, "reward_std": 0.029609717428684235, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.333984375, "rewards/tag_count_reward/std": 0.24709556996822357, "step": 2097 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8208539149997555, "frac_reward_zero_std": 0.0, "grad_norm": 0.6508234560008012, "kl": 0.42578125, "learning_rate": 1.9018089890458946e-06, "loss": 0.0043, "num_tokens": 936932592.0, "reward": 0.0433349609375, "reward_std": 0.028881367295980453, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.2367647886276245, "step": 2098 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8212451704406515, "frac_reward_zero_std": 0.0, "grad_norm": 0.6231804096309911, "kl": 0.428955078125, "learning_rate": 1.893803030712571e-06, "loss": 0.0043, "num_tokens": 937485392.0, "reward": 0.0411376953125, "reward_std": 0.029907630756497383, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3291015625, "rewards/tag_count_reward/std": 0.24423304200172424, "step": 2099 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8216364258815474, "frac_reward_zero_std": 0.0, "grad_norm": 0.6720355638948959, "kl": 0.43017578125, "learning_rate": 1.8858121961873444e-06, "loss": 0.0043, "num_tokens": 938039328.0, "reward": 0.048828125, "reward_std": 0.027484161779284477, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.22415418922901154, "step": 2100 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8220276813224434, "frac_reward_zero_std": 0.0, "grad_norm": 0.6866540082604644, "kl": 0.426025390625, "learning_rate": 1.8778365003787934e-06, "loss": 0.0043, "num_tokens": 938594048.0, "reward": 0.0439453125, "reward_std": 0.03095218725502491, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.2464204579591751, "step": 2101 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8224189367633393, "frac_reward_zero_std": 0.0, "grad_norm": 0.7043677942487935, "kl": 0.434326171875, "learning_rate": 1.8698759581672487e-06, "loss": 0.0043, "num_tokens": 939145520.0, "reward": 0.04296875, "reward_std": 0.031400591135025024, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.25340813398361206, "step": 2102 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8228101922042353, "frac_reward_zero_std": 0.0, "grad_norm": 0.7323289866100576, "kl": 0.427490234375, "learning_rate": 1.8619305844047664e-06, "loss": 0.0043, "num_tokens": 939699152.0, "reward": 0.043701171875, "reward_std": 0.028307706117630005, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.22867874801158905, "step": 2103 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8232014476451314, "frac_reward_zero_std": 0.0, "grad_norm": 0.7059136444188298, "kl": 0.419921875, "learning_rate": 1.8540003939151107e-06, "loss": 0.0042, "num_tokens": 940254720.0, "reward": 0.04638671875, "reward_std": 0.029845640063285828, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.24551741778850555, "step": 2104 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8235927030860273, "frac_reward_zero_std": 0.0, "grad_norm": 0.5993663343838723, "kl": 0.421142578125, "learning_rate": 1.8460854014937068e-06, "loss": 0.0042, "num_tokens": 940806736.0, "reward": 0.043212890625, "reward_std": 0.028413057327270508, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.2345648854970932, "step": 2105 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8239839585269233, "frac_reward_zero_std": 0.0, "grad_norm": 0.6089954359630396, "kl": 0.416259765625, "learning_rate": 1.838185621907631e-06, "loss": 0.0042, "num_tokens": 941362192.0, "reward": 0.0426025390625, "reward_std": 0.027734126895666122, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3408203125, "rewards/tag_count_reward/std": 0.22644679248332977, "step": 2106 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8243752139678192, "frac_reward_zero_std": 0.0, "grad_norm": 0.6253815679614663, "kl": 0.40478515625, "learning_rate": 1.8303010698955803e-06, "loss": 0.004, "num_tokens": 941917376.0, "reward": 0.0430908203125, "reward_std": 0.028014864772558212, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.23548349738121033, "step": 2107 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8247664694087152, "frac_reward_zero_std": 0.0, "grad_norm": 0.6137193664322789, "kl": 0.412353515625, "learning_rate": 1.8224317601678365e-06, "loss": 0.0041, "num_tokens": 942470464.0, "reward": 0.044677734375, "reward_std": 0.029308632016181946, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.23677489161491394, "step": 2108 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8251577248496111, "frac_reward_zero_std": 0.0, "grad_norm": 0.8599945986682973, "kl": 0.40771484375, "learning_rate": 1.8145777074062432e-06, "loss": 0.0041, "num_tokens": 943023936.0, "reward": 0.045166015625, "reward_std": 0.030259529128670692, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.24315853416919708, "step": 2109 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8255489802905072, "frac_reward_zero_std": 0.0, "grad_norm": 0.6419359678198375, "kl": 0.401611328125, "learning_rate": 1.8067389262641821e-06, "loss": 0.004, "num_tokens": 943579072.0, "reward": 0.0435791015625, "reward_std": 0.03014005720615387, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.24412326514720917, "step": 2110 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8259402357314032, "frac_reward_zero_std": 0.0, "grad_norm": 0.9113574948394378, "kl": 0.396728515625, "learning_rate": 1.798915431366538e-06, "loss": 0.004, "num_tokens": 944133216.0, "reward": 0.0440673828125, "reward_std": 0.02964569255709648, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.2435106784105301, "step": 2111 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8263314911722991, "frac_reward_zero_std": 0.0, "grad_norm": 0.6583680414504623, "kl": 0.401123046875, "learning_rate": 1.791107237309685e-06, "loss": 0.004, "num_tokens": 944686496.0, "reward": 0.0462646484375, "reward_std": 0.031362902373075485, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.25141867995262146, "step": 2112 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8267227466131951, "frac_reward_zero_std": 0.0, "grad_norm": 0.6656596121643341, "kl": 0.3955078125, "learning_rate": 1.783314358661441e-06, "loss": 0.004, "num_tokens": 945239296.0, "reward": 0.04541015625, "reward_std": 0.028936944901943207, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.23296760022640228, "step": 2113 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.827114002054091, "frac_reward_zero_std": 0.0, "grad_norm": 0.6052985516050069, "kl": 0.398681640625, "learning_rate": 1.7755368099610504e-06, "loss": 0.004, "num_tokens": 945793712.0, "reward": 0.0396728515625, "reward_std": 0.03207258880138397, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3173828125, "rewards/tag_count_reward/std": 0.2564856708049774, "step": 2114 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.827505257494987, "frac_reward_zero_std": 0.0, "grad_norm": 0.6080276434392813, "kl": 0.402099609375, "learning_rate": 1.767774605719167e-06, "loss": 0.004, "num_tokens": 946346768.0, "reward": 0.044677734375, "reward_std": 0.02900758385658264, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.23677489161491394, "step": 2115 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8278965129358831, "frac_reward_zero_std": 0.0, "grad_norm": 0.7056972991847066, "kl": 0.3955078125, "learning_rate": 1.7600277604178007e-06, "loss": 0.004, "num_tokens": 946902016.0, "reward": 0.044677734375, "reward_std": 0.029900195077061653, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.24189528822898865, "step": 2116 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.828287768376779, "frac_reward_zero_std": 0.0, "grad_norm": 0.5366047862599092, "kl": 0.38720703125, "learning_rate": 1.7522962885103145e-06, "loss": 0.0039, "num_tokens": 947455856.0, "reward": 0.043212890625, "reward_std": 0.030488092452287674, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.24278025329113007, "step": 2117 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.828679023817675, "frac_reward_zero_std": 0.0, "grad_norm": 0.5084540972804327, "kl": 0.392333984375, "learning_rate": 1.7445802044213934e-06, "loss": 0.0039, "num_tokens": 948011888.0, "reward": 0.0445556640625, "reward_std": 0.028342558071017265, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.2293372005224228, "step": 2118 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8290702792585709, "frac_reward_zero_std": 0.0, "grad_norm": 0.6257305272262472, "kl": 0.391845703125, "learning_rate": 1.7368795225470049e-06, "loss": 0.0039, "num_tokens": 948564032.0, "reward": 0.0450439453125, "reward_std": 0.028817273676395416, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.23385155200958252, "step": 2119 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8294615346994669, "frac_reward_zero_std": 0.0, "grad_norm": 0.523351277093096, "kl": 0.401611328125, "learning_rate": 1.7291942572543806e-06, "loss": 0.004, "num_tokens": 949119984.0, "reward": 0.04638671875, "reward_std": 0.029493950307369232, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.23532284796237946, "step": 2120 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8298527901403628, "frac_reward_zero_std": 0.0, "grad_norm": 0.6162681521539706, "kl": 0.38427734375, "learning_rate": 1.7215244228820027e-06, "loss": 0.0038, "num_tokens": 949672832.0, "reward": 0.0455322265625, "reward_std": 0.03165017068386078, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.25220951437950134, "step": 2121 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8302440455812589, "frac_reward_zero_std": 0.0, "grad_norm": 0.5613037016404667, "kl": 0.386474609375, "learning_rate": 1.713870033739541e-06, "loss": 0.0039, "num_tokens": 950227328.0, "reward": 0.041748046875, "reward_std": 0.0307474248111248, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.333984375, "rewards/tag_count_reward/std": 0.24709556996822357, "step": 2122 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8306353010221549, "frac_reward_zero_std": 0.0, "grad_norm": 0.5879676089400846, "kl": 0.391845703125, "learning_rate": 1.7062311041078695e-06, "loss": 0.0039, "num_tokens": 950781616.0, "reward": 0.0438232421875, "reward_std": 0.030026983469724655, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.244327113032341, "step": 2123 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8310265564630508, "frac_reward_zero_std": 0.0, "grad_norm": 0.6761991204799122, "kl": 0.400390625, "learning_rate": 1.6986076482390078e-06, "loss": 0.004, "num_tokens": 951334480.0, "reward": 0.0396728515625, "reward_std": 0.027721254155039787, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3173828125, "rewards/tag_count_reward/std": 0.22272947430610657, "step": 2124 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8314178119039468, "frac_reward_zero_std": 0.0, "grad_norm": 1.1004505587976692, "kl": 0.446533203125, "learning_rate": 1.6909996803561101e-06, "loss": 0.0045, "num_tokens": 951887936.0, "reward": 0.0435791015625, "reward_std": 0.03112279251217842, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.2496301680803299, "step": 2125 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8318090673448427, "frac_reward_zero_std": 0.0, "grad_norm": 0.6041572727394978, "kl": 0.392578125, "learning_rate": 1.6834072146534387e-06, "loss": 0.0039, "num_tokens": 952442848.0, "reward": 0.0396728515625, "reward_std": 0.0293120089918375, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3173828125, "rewards/tag_count_reward/std": 0.23763662576675415, "step": 2126 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8322003227857387, "frac_reward_zero_std": 0.0, "grad_norm": 0.5508553794899292, "kl": 0.398681640625, "learning_rate": 1.6758302652963176e-06, "loss": 0.004, "num_tokens": 952995440.0, "reward": 0.0433349609375, "reward_std": 0.02861533686518669, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.22833307087421417, "step": 2127 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8325915782266348, "frac_reward_zero_std": 0.0, "grad_norm": 0.7313263495269208, "kl": 0.3994140625, "learning_rate": 1.6682688464211427e-06, "loss": 0.004, "num_tokens": 953551504.0, "reward": 0.0452880859375, "reward_std": 0.028082840144634247, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.22866827249526978, "step": 2128 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8329828336675307, "frac_reward_zero_std": 0.0, "grad_norm": 0.716154053529922, "kl": 0.400634765625, "learning_rate": 1.6607229721353202e-06, "loss": 0.004, "num_tokens": 954106528.0, "reward": 0.049560546875, "reward_std": 0.03065500408411026, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.247591033577919, "step": 2129 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8333740891084267, "frac_reward_zero_std": 0.0, "grad_norm": 1.0430631016874203, "kl": 0.4091796875, "learning_rate": 1.6531926565172573e-06, "loss": 0.0041, "num_tokens": 954660880.0, "reward": 0.042236328125, "reward_std": 0.02884693071246147, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.23864373564720154, "step": 2130 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8337653445493226, "frac_reward_zero_std": 0.0, "grad_norm": 0.5982094792068976, "kl": 0.41748046875, "learning_rate": 1.6456779136163404e-06, "loss": 0.0042, "num_tokens": 955213376.0, "reward": 0.0450439453125, "reward_std": 0.0290562491863966, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.23800699412822723, "step": 2131 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8341565999902186, "frac_reward_zero_std": 0.0, "grad_norm": 0.6918022853899252, "kl": 0.417236328125, "learning_rate": 1.638178757452894e-06, "loss": 0.0042, "num_tokens": 955766976.0, "reward": 0.04248046875, "reward_std": 0.029265031218528748, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.23584304749965668, "step": 2132 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8345478554311145, "frac_reward_zero_std": 0.0, "grad_norm": 0.7076251300805463, "kl": 0.444091796875, "learning_rate": 1.6306952020181577e-06, "loss": 0.0044, "num_tokens": 956322880.0, "reward": 0.0474853515625, "reward_std": 0.030968744307756424, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.25141867995262146, "step": 2133 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8349391108720106, "frac_reward_zero_std": 0.0, "grad_norm": 0.6607468791138671, "kl": 0.425048828125, "learning_rate": 1.6232272612742806e-06, "loss": 0.0043, "num_tokens": 956876992.0, "reward": 0.0445556640625, "reward_std": 0.027240067720413208, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.22392983734607697, "step": 2134 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8353303663129066, "frac_reward_zero_std": 0.0, "grad_norm": 0.7503383018764018, "kl": 0.439697265625, "learning_rate": 1.6157749491542662e-06, "loss": 0.0044, "num_tokens": 957430256.0, "reward": 0.0445556640625, "reward_std": 0.029311949387192726, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.23978638648986816, "step": 2135 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8357216217538025, "frac_reward_zero_std": 0.0, "grad_norm": 0.8533562032200527, "kl": 0.43359375, "learning_rate": 1.6083382795619606e-06, "loss": 0.0043, "num_tokens": 957983232.0, "reward": 0.0404052734375, "reward_std": 0.0278190728276968, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.22197164595127106, "step": 2136 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8361128771946985, "frac_reward_zero_std": 0.0, "grad_norm": 0.687736034821832, "kl": 0.43603515625, "learning_rate": 1.6009172663720352e-06, "loss": 0.0044, "num_tokens": 958536752.0, "reward": 0.0438232421875, "reward_std": 0.030033990740776062, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.2412988692522049, "step": 2137 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8365041326355944, "frac_reward_zero_std": 0.0, "grad_norm": 0.7429482682499823, "kl": 0.44970703125, "learning_rate": 1.593511923429939e-06, "loss": 0.0045, "num_tokens": 959088752.0, "reward": 0.0479736328125, "reward_std": 0.029826883226633072, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.24438980221748352, "step": 2138 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8368953880764904, "frac_reward_zero_std": 0.0, "grad_norm": 0.6433013690225309, "kl": 0.439697265625, "learning_rate": 1.5861222645518926e-06, "loss": 0.0044, "num_tokens": 959642944.0, "reward": 0.04443359375, "reward_std": 0.028974121436476707, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.23244096338748932, "step": 2139 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8372866435173865, "frac_reward_zero_std": 0.0, "grad_norm": 0.7727102645215496, "kl": 0.447265625, "learning_rate": 1.578748303524852e-06, "loss": 0.0045, "num_tokens": 960198608.0, "reward": 0.0421142578125, "reward_std": 0.02937050350010395, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3369140625, "rewards/tag_count_reward/std": 0.23538589477539062, "step": 2140 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8376778989582824, "frac_reward_zero_std": 0.0, "grad_norm": 0.6302369998046717, "kl": 0.447509765625, "learning_rate": 1.571390054106482e-06, "loss": 0.0045, "num_tokens": 960752288.0, "reward": 0.0472412109375, "reward_std": 0.030783021822571754, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.24850773811340332, "step": 2141 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8380691543991784, "frac_reward_zero_std": 0.0, "grad_norm": 0.6881216143082336, "kl": 0.458251953125, "learning_rate": 1.5640475300251423e-06, "loss": 0.0046, "num_tokens": 961305696.0, "reward": 0.0440673828125, "reward_std": 0.030140992254018784, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.24451513588428497, "step": 2142 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8384604098400743, "frac_reward_zero_std": 0.0, "grad_norm": 0.8617435597398837, "kl": 0.448974609375, "learning_rate": 1.5567207449798517e-06, "loss": 0.0045, "num_tokens": 961857184.0, "reward": 0.042236328125, "reward_std": 0.030171433463692665, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.24572789669036865, "step": 2143 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8388516652809703, "frac_reward_zero_std": 0.0, "grad_norm": 0.7445342564279648, "kl": 0.445068359375, "learning_rate": 1.549409712640253e-06, "loss": 0.0045, "num_tokens": 962412608.0, "reward": 0.04638671875, "reward_std": 0.03206964582204819, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.25911685824394226, "step": 2144 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8392429207218662, "frac_reward_zero_std": 0.0, "grad_norm": 0.8270526332088134, "kl": 0.442626953125, "learning_rate": 1.5421144466466164e-06, "loss": 0.0044, "num_tokens": 962966560.0, "reward": 0.0435791015625, "reward_std": 0.029999934136867523, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.2410924732685089, "step": 2145 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8396341761627623, "frac_reward_zero_std": 0.0, "grad_norm": 0.8283941497259624, "kl": 0.46728515625, "learning_rate": 1.5348349606097857e-06, "loss": 0.0047, "num_tokens": 963521840.0, "reward": 0.04736328125, "reward_std": 0.030757615342736244, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.24351264536380768, "step": 2146 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8400254316036583, "frac_reward_zero_std": 0.0, "grad_norm": 0.6988734815103396, "kl": 0.445556640625, "learning_rate": 1.5275712681111643e-06, "loss": 0.0045, "num_tokens": 964076512.0, "reward": 0.04345703125, "reward_std": 0.02839059568941593, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.23058828711509705, "step": 2147 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8404166870445542, "frac_reward_zero_std": 0.0, "grad_norm": 0.6651627855324725, "kl": 0.44189453125, "learning_rate": 1.5203233827026964e-06, "loss": 0.0044, "num_tokens": 964632032.0, "reward": 0.043212890625, "reward_std": 0.030501501634716988, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.24579022824764252, "step": 2148 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8408079424854502, "frac_reward_zero_std": 0.0, "grad_norm": 0.6773749395571111, "kl": 0.440673828125, "learning_rate": 1.5130913179068275e-06, "loss": 0.0044, "num_tokens": 965186272.0, "reward": 0.0404052734375, "reward_std": 0.026924602687358856, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.22197164595127106, "step": 2149 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8411991979263461, "frac_reward_zero_std": 0.0, "grad_norm": 0.7263694864528407, "kl": 0.44189453125, "learning_rate": 1.5058750872164885e-06, "loss": 0.0044, "num_tokens": 965741376.0, "reward": 0.041748046875, "reward_std": 0.029678411781787872, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.333984375, "rewards/tag_count_reward/std": 0.2359323352575302, "step": 2150 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8415904533672421, "frac_reward_zero_std": 0.0, "grad_norm": 0.7266275105569261, "kl": 0.442138671875, "learning_rate": 1.498674704095071e-06, "loss": 0.0044, "num_tokens": 966294768.0, "reward": 0.042236328125, "reward_std": 0.028986990451812744, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.2355424463748932, "step": 2151 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8419817088081382, "frac_reward_zero_std": 0.0, "grad_norm": 0.8375996411007963, "kl": 0.45654296875, "learning_rate": 1.4914901819763938e-06, "loss": 0.0046, "num_tokens": 966846672.0, "reward": 0.044921875, "reward_std": 0.02775801345705986, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.23483411967754364, "step": 2152 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8423729642490341, "frac_reward_zero_std": 0.0, "grad_norm": 0.6750529691370182, "kl": 0.444580078125, "learning_rate": 1.4843215342646922e-06, "loss": 0.0044, "num_tokens": 967400032.0, "reward": 0.0428466796875, "reward_std": 0.029046323150396347, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.23729796707630157, "step": 2153 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8427642196899301, "frac_reward_zero_std": 0.0, "grad_norm": 0.7442237537571614, "kl": 0.4482421875, "learning_rate": 1.4771687743345787e-06, "loss": 0.0045, "num_tokens": 967953296.0, "reward": 0.0460205078125, "reward_std": 0.029957065358757973, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.24040846526622772, "step": 2154 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.843155475130826, "frac_reward_zero_std": 0.0, "grad_norm": 0.8592431123832857, "kl": 0.434814453125, "learning_rate": 1.4700319155310227e-06, "loss": 0.0043, "num_tokens": 968507008.0, "reward": 0.0430908203125, "reward_std": 0.029965464025735855, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.2506101131439209, "step": 2155 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.843546730571722, "frac_reward_zero_std": 0.0, "grad_norm": 0.8573925564790198, "kl": 0.44873046875, "learning_rate": 1.4629109711693313e-06, "loss": 0.0045, "num_tokens": 969060480.0, "reward": 0.0457763671875, "reward_std": 0.03051738440990448, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24438980221748352, "step": 2156 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8439379860126179, "frac_reward_zero_std": 0.0, "grad_norm": 0.8682528783918683, "kl": 0.46337890625, "learning_rate": 1.4558059545351144e-06, "loss": 0.0046, "num_tokens": 969614240.0, "reward": 0.044677734375, "reward_std": 0.029997721314430237, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.24290642142295837, "step": 2157 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.844329241453514, "frac_reward_zero_std": 0.0, "grad_norm": 0.7650148349851617, "kl": 0.436279296875, "learning_rate": 1.4487168788842721e-06, "loss": 0.0044, "num_tokens": 970168496.0, "reward": 0.041015625, "reward_std": 0.028291691094636917, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.328125, "rewards/tag_count_reward/std": 0.23168183863162994, "step": 2158 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.84472049689441, "frac_reward_zero_std": 0.0, "grad_norm": 0.6646146397886646, "kl": 0.430908203125, "learning_rate": 1.4416437574429587e-06, "loss": 0.0043, "num_tokens": 970722528.0, "reward": 0.0430908203125, "reward_std": 0.029670197516679764, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.23858554661273956, "step": 2159 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8451117523353059, "frac_reward_zero_std": 0.0, "grad_norm": 0.7930698140938448, "kl": 0.42431640625, "learning_rate": 1.4345866034075628e-06, "loss": 0.0042, "num_tokens": 971276848.0, "reward": 0.0430908203125, "reward_std": 0.02933730185031891, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.2406313717365265, "step": 2160 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8455030077762019, "frac_reward_zero_std": 0.0, "grad_norm": 0.6782743217121455, "kl": 0.413330078125, "learning_rate": 1.4275454299446834e-06, "loss": 0.0041, "num_tokens": 971832448.0, "reward": 0.04345703125, "reward_std": 0.028911972418427467, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.23584304749965668, "step": 2161 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8458942632170978, "frac_reward_zero_std": 0.0, "grad_norm": 0.5696283200415766, "kl": 0.412353515625, "learning_rate": 1.4205202501911052e-06, "loss": 0.0041, "num_tokens": 972386784.0, "reward": 0.0450439453125, "reward_std": 0.030085723847150803, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.24510957300662994, "step": 2162 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8462855186579938, "frac_reward_zero_std": 0.0, "grad_norm": 0.8510423493317136, "kl": 0.392822265625, "learning_rate": 1.4135110772537685e-06, "loss": 0.0039, "num_tokens": 972944512.0, "reward": 0.0457763671875, "reward_std": 0.02933356910943985, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.2423757016658783, "step": 2163 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8466767740988899, "frac_reward_zero_std": 0.0, "grad_norm": 0.5619323496258705, "kl": 0.386474609375, "learning_rate": 1.4065179242097582e-06, "loss": 0.0039, "num_tokens": 973499104.0, "reward": 0.041748046875, "reward_std": 0.028973262757062912, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.333984375, "rewards/tag_count_reward/std": 0.2359323352575302, "step": 2164 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8470680295397858, "frac_reward_zero_std": 0.0, "grad_norm": 0.5850229740266566, "kl": 0.3935546875, "learning_rate": 1.3995408041062642e-06, "loss": 0.0039, "num_tokens": 974052384.0, "reward": 0.0458984375, "reward_std": 0.02908143401145935, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.23418088257312775, "step": 2165 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8474592849806818, "frac_reward_zero_std": 0.0, "grad_norm": 0.7189387337870944, "kl": 0.3955078125, "learning_rate": 1.3925797299605649e-06, "loss": 0.004, "num_tokens": 974604752.0, "reward": 0.04541015625, "reward_std": 0.028845258057117462, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.23191313445568085, "step": 2166 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8478505404215777, "frac_reward_zero_std": 0.0, "grad_norm": 0.6374045920762919, "kl": 0.383544921875, "learning_rate": 1.3856347147600014e-06, "loss": 0.0038, "num_tokens": 975161392.0, "reward": 0.0462646484375, "reward_std": 0.030001819133758545, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.24748854339122772, "step": 2167 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8482417958624737, "frac_reward_zero_std": 0.0, "grad_norm": 0.6398836785287652, "kl": 0.373779296875, "learning_rate": 1.3787057714619535e-06, "loss": 0.0037, "num_tokens": 975717840.0, "reward": 0.043212890625, "reward_std": 0.03164727985858917, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.258428692817688, "step": 2168 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8486330513033696, "frac_reward_zero_std": 0.0, "grad_norm": 0.601941387519535, "kl": 0.3798828125, "learning_rate": 1.3717929129938179e-06, "loss": 0.0038, "num_tokens": 976275104.0, "reward": 0.0404052734375, "reward_std": 0.030525391921401024, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.2450626939535141, "step": 2169 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8490243067442657, "frac_reward_zero_std": 0.0, "grad_norm": 0.524895749250975, "kl": 0.370361328125, "learning_rate": 1.3648961522529802e-06, "loss": 0.0037, "num_tokens": 976827936.0, "reward": 0.040283203125, "reward_std": 0.02747073397040367, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.322265625, "rewards/tag_count_reward/std": 0.21951895952224731, "step": 2170 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8494155621851617, "frac_reward_zero_std": 0.0, "grad_norm": 0.568614551461628, "kl": 0.371337890625, "learning_rate": 1.3580155021067898e-06, "loss": 0.0037, "num_tokens": 977383072.0, "reward": 0.04443359375, "reward_std": 0.028041526675224304, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.2238464504480362, "step": 2171 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8498068176260576, "frac_reward_zero_std": 0.0, "grad_norm": 0.767195257850468, "kl": 0.3828125, "learning_rate": 1.3511509753925422e-06, "loss": 0.0038, "num_tokens": 977937824.0, "reward": 0.043212890625, "reward_std": 0.030925214290618896, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.2497471123933792, "step": 2172 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8501980730669536, "frac_reward_zero_std": 0.0, "grad_norm": 0.5944605772079581, "kl": 0.376953125, "learning_rate": 1.3443025849174485e-06, "loss": 0.0038, "num_tokens": 978490704.0, "reward": 0.0430908203125, "reward_std": 0.029214859008789062, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.23652203381061554, "step": 2173 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8505893285078495, "frac_reward_zero_std": 0.0, "grad_norm": 0.659366950790672, "kl": 0.377197265625, "learning_rate": 1.3374703434586133e-06, "loss": 0.0038, "num_tokens": 979045520.0, "reward": 0.046875, "reward_std": 0.0302298404276371, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.2465447634458542, "step": 2174 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8509805839487455, "frac_reward_zero_std": 0.0, "grad_norm": 0.7000024699791514, "kl": 0.375244140625, "learning_rate": 1.3306542637630194e-06, "loss": 0.0037, "num_tokens": 979601744.0, "reward": 0.042724609375, "reward_std": 0.029464520514011383, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.341796875, "rewards/tag_count_reward/std": 0.2392207384109497, "step": 2175 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8513718393896416, "frac_reward_zero_std": 0.0, "grad_norm": 0.7163495616059063, "kl": 0.39404296875, "learning_rate": 1.3238543585474871e-06, "loss": 0.0039, "num_tokens": 980154752.0, "reward": 0.044189453125, "reward_std": 0.03192325308918953, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.2572999894618988, "step": 2176 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8517630948305375, "frac_reward_zero_std": 0.0, "grad_norm": 0.6172243034606281, "kl": 0.385986328125, "learning_rate": 1.3170706404986645e-06, "loss": 0.0039, "num_tokens": 980707200.0, "reward": 0.043701171875, "reward_std": 0.03224213048815727, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.2626035809516907, "step": 2177 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8521543502714335, "frac_reward_zero_std": 0.0, "grad_norm": 0.5486026950875421, "kl": 0.383056640625, "learning_rate": 1.3103031222729977e-06, "loss": 0.0038, "num_tokens": 981260272.0, "reward": 0.0460205078125, "reward_std": 0.032578159123659134, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.24927707016468048, "step": 2178 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8525456057123294, "frac_reward_zero_std": 0.0, "grad_norm": 0.6456860853267801, "kl": 0.384033203125, "learning_rate": 1.3035518164967086e-06, "loss": 0.0038, "num_tokens": 981812512.0, "reward": 0.0438232421875, "reward_std": 0.02921324595808983, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.23363855481147766, "step": 2179 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8529368611532254, "frac_reward_zero_std": 0.0, "grad_norm": 0.6039646642963545, "kl": 0.37744140625, "learning_rate": 1.2968167357657746e-06, "loss": 0.0038, "num_tokens": 982366320.0, "reward": 0.0428466796875, "reward_std": 0.028375230729579926, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.22995422780513763, "step": 2180 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8533281165941213, "frac_reward_zero_std": 0.0, "grad_norm": 0.6110398521094126, "kl": 0.37451171875, "learning_rate": 1.2900978926458985e-06, "loss": 0.0037, "num_tokens": 982921456.0, "reward": 0.04833984375, "reward_std": 0.029238931834697723, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.24021922051906586, "step": 2181 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8537193720350174, "frac_reward_zero_std": 0.0, "grad_norm": 0.5920033087272102, "kl": 0.376953125, "learning_rate": 1.2833952996724864e-06, "loss": 0.0038, "num_tokens": 983476928.0, "reward": 0.04248046875, "reward_std": 0.026612568646669388, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.2163293957710266, "step": 2182 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8541106274759134, "frac_reward_zero_std": 0.0, "grad_norm": 0.5839747068948649, "kl": 0.39306640625, "learning_rate": 1.2767089693506363e-06, "loss": 0.0039, "num_tokens": 984029136.0, "reward": 0.042236328125, "reward_std": 0.028461404144763947, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.337890625, "rewards/tag_count_reward/std": 0.23134273290634155, "step": 2183 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8545018829168093, "frac_reward_zero_std": 0.0, "grad_norm": 0.6453457889829588, "kl": 0.37841796875, "learning_rate": 1.2700389141550884e-06, "loss": 0.0038, "num_tokens": 984582784.0, "reward": 0.0469970703125, "reward_std": 0.02836996130645275, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.23220813274383545, "step": 2184 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8548931383577053, "frac_reward_zero_std": 0.0, "grad_norm": 0.5356233714375672, "kl": 0.385009765625, "learning_rate": 1.263385146530234e-06, "loss": 0.0038, "num_tokens": 985136800.0, "reward": 0.0477294921875, "reward_std": 0.030107729136943817, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.24142581224441528, "step": 2185 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8552843937986012, "frac_reward_zero_std": 0.0, "grad_norm": 2.0960148724767795, "kl": 0.382080078125, "learning_rate": 1.2567476788900679e-06, "loss": 0.0038, "num_tokens": 985692080.0, "reward": 0.0443115234375, "reward_std": 0.02967800199985504, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3544921875, "rewards/tag_count_reward/std": 0.24267572164535522, "step": 2186 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8556756492394972, "frac_reward_zero_std": 0.0, "grad_norm": 0.7351976421920361, "kl": 0.388671875, "learning_rate": 1.2501265236181736e-06, "loss": 0.0039, "num_tokens": 986245920.0, "reward": 0.0438232421875, "reward_std": 0.029224392026662827, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.2412988692522049, "step": 2187 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8560669046803933, "frac_reward_zero_std": 0.0, "grad_norm": 0.5151159504486109, "kl": 0.394287109375, "learning_rate": 1.2435216930677108e-06, "loss": 0.0039, "num_tokens": 986799792.0, "reward": 0.04541015625, "reward_std": 0.03041963279247284, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.24526771903038025, "step": 2188 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8564581601212892, "frac_reward_zero_std": 0.0, "grad_norm": 0.5799755174008381, "kl": 0.37744140625, "learning_rate": 1.2369331995613664e-06, "loss": 0.0038, "num_tokens": 987358224.0, "reward": 0.0430908203125, "reward_std": 0.030177131295204163, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.24765869975090027, "step": 2189 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8568494155621852, "frac_reward_zero_std": 0.0, "grad_norm": 0.7968969469087344, "kl": 0.37939453125, "learning_rate": 1.2303610553913548e-06, "loss": 0.0038, "num_tokens": 987912560.0, "reward": 0.0447998046875, "reward_std": 0.026971234008669853, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.22517482936382294, "step": 2190 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8572406710030811, "frac_reward_zero_std": 0.0, "grad_norm": 0.5807174181436249, "kl": 0.37939453125, "learning_rate": 1.2238052728193927e-06, "loss": 0.0038, "num_tokens": 988466656.0, "reward": 0.046142578125, "reward_std": 0.029689114540815353, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.2383868396282196, "step": 2191 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8576319264439771, "frac_reward_zero_std": 0.0, "grad_norm": 0.5053779296609073, "kl": 0.373779296875, "learning_rate": 1.2172658640766622e-06, "loss": 0.0037, "num_tokens": 989021600.0, "reward": 0.0498046875, "reward_std": 0.03136250376701355, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.25808021426200867, "step": 2192 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.858023181884873, "frac_reward_zero_std": 0.0, "grad_norm": 0.5730051546029993, "kl": 0.3779296875, "learning_rate": 1.2107428413637979e-06, "loss": 0.0038, "num_tokens": 989577792.0, "reward": 0.0458984375, "reward_std": 0.02971535176038742, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.23935678601264954, "step": 2193 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.858414437325769, "frac_reward_zero_std": 0.0, "grad_norm": 0.7137621262103646, "kl": 0.375732421875, "learning_rate": 1.2042362168508714e-06, "loss": 0.0038, "num_tokens": 990132432.0, "reward": 0.045166015625, "reward_std": 0.0297858789563179, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.23806531727313995, "step": 2194 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8588056927666651, "frac_reward_zero_std": 0.0, "grad_norm": 0.5617829020161041, "kl": 0.377685546875, "learning_rate": 1.1977460026773447e-06, "loss": 0.0038, "num_tokens": 990687712.0, "reward": 0.0419921875, "reward_std": 0.03018968738615513, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3359375, "rewards/tag_count_reward/std": 0.2454238086938858, "step": 2195 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.859196948207561, "frac_reward_zero_std": 0.0, "grad_norm": 0.7032370446588903, "kl": 0.385986328125, "learning_rate": 1.1912722109520791e-06, "loss": 0.0039, "num_tokens": 991242240.0, "reward": 0.044677734375, "reward_std": 0.029919996857643127, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.2449161559343338, "step": 2196 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.859588203648457, "frac_reward_zero_std": 0.0, "grad_norm": 0.5736214297517783, "kl": 0.387451171875, "learning_rate": 1.1848148537532845e-06, "loss": 0.0039, "num_tokens": 991797504.0, "reward": 0.0438232421875, "reward_std": 0.028329845517873764, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.2298542857170105, "step": 2197 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8599794590893529, "frac_reward_zero_std": 0.0, "grad_norm": 0.6710565850309299, "kl": 0.392578125, "learning_rate": 1.1783739431285123e-06, "loss": 0.0039, "num_tokens": 992351904.0, "reward": 0.04541015625, "reward_std": 0.02970237284898758, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.23999592661857605, "step": 2198 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8603707145302489, "frac_reward_zero_std": 0.0, "grad_norm": 0.7087381615600636, "kl": 0.431640625, "learning_rate": 1.171949491094635e-06, "loss": 0.0043, "num_tokens": 992903216.0, "reward": 0.0482177734375, "reward_std": 0.03019869700074196, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.24630969762802124, "step": 2199 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.860761969971145, "frac_reward_zero_std": 0.0, "grad_norm": 0.6470174800913586, "kl": 0.391845703125, "learning_rate": 1.165541509637812e-06, "loss": 0.0039, "num_tokens": 993456176.0, "reward": 0.043212890625, "reward_std": 0.03112759441137314, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.2517022490501404, "step": 2200 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8611532254120409, "frac_reward_zero_std": 0.0, "grad_norm": 0.5816392049402271, "kl": 0.404541015625, "learning_rate": 1.15915001071347e-06, "loss": 0.004, "num_tokens": 994009104.0, "reward": 0.046875, "reward_std": 0.03138978034257889, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.2543735206127167, "step": 2201 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8615444808529369, "frac_reward_zero_std": 0.0, "grad_norm": 1.076251457818846, "kl": 0.39208984375, "learning_rate": 1.1527750062462928e-06, "loss": 0.0039, "num_tokens": 994562432.0, "reward": 0.0457763671875, "reward_std": 0.02922971174120903, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.23623040318489075, "step": 2202 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8619357362938328, "frac_reward_zero_std": 0.0, "grad_norm": 0.6134831402462491, "kl": 0.400634765625, "learning_rate": 1.146416508130186e-06, "loss": 0.004, "num_tokens": 995117280.0, "reward": 0.0433349609375, "reward_std": 0.028146186843514442, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.2272571176290512, "step": 2203 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8623269917347288, "frac_reward_zero_std": 0.0, "grad_norm": 0.662344730826458, "kl": 0.395751953125, "learning_rate": 1.1400745282282566e-06, "loss": 0.004, "num_tokens": 995671424.0, "reward": 0.0438232421875, "reward_std": 0.02886171266436577, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.23512543737888336, "step": 2204 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8627182471756247, "frac_reward_zero_std": 0.0, "grad_norm": 0.6267818593065638, "kl": 0.402099609375, "learning_rate": 1.1337490783728033e-06, "loss": 0.004, "num_tokens": 996226944.0, "reward": 0.04638671875, "reward_std": 0.029515229165554047, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.2394527643918991, "step": 2205 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8631095026165208, "frac_reward_zero_std": 0.0, "grad_norm": 0.6070626594843525, "kl": 0.4111328125, "learning_rate": 1.1274401703652692e-06, "loss": 0.0041, "num_tokens": 996781024.0, "reward": 0.0477294921875, "reward_std": 0.03265084698796272, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.2600002884864807, "step": 2206 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8635007580574168, "frac_reward_zero_std": 0.0, "grad_norm": 0.6997603847952721, "kl": 0.4130859375, "learning_rate": 1.121147815976248e-06, "loss": 0.0041, "num_tokens": 997334736.0, "reward": 0.04638671875, "reward_std": 0.030355628579854965, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.2425040453672409, "step": 2207 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8638920134983127, "frac_reward_zero_std": 0.0, "grad_norm": 0.7513495567476132, "kl": 0.41357421875, "learning_rate": 1.114872026945445e-06, "loss": 0.0041, "num_tokens": 997888576.0, "reward": 0.04345703125, "reward_std": 0.03080843761563301, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.24701032042503357, "step": 2208 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8642832689392087, "frac_reward_zero_std": 0.0, "grad_norm": 0.6779190022051121, "kl": 0.42529296875, "learning_rate": 1.1086128149816544e-06, "loss": 0.0043, "num_tokens": 998442704.0, "reward": 0.047119140625, "reward_std": 0.02969776839017868, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.24049805104732513, "step": 2209 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8646745243801046, "frac_reward_zero_std": 0.0, "grad_norm": 0.7342138521092078, "kl": 0.41259765625, "learning_rate": 1.1023701917627527e-06, "loss": 0.0041, "num_tokens": 998997008.0, "reward": 0.0484619140625, "reward_std": 0.029565323144197464, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3876953125, "rewards/tag_count_reward/std": 0.23708805441856384, "step": 2210 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8650657798210006, "frac_reward_zero_std": 0.0, "grad_norm": 0.7089576965048253, "kl": 0.42138671875, "learning_rate": 1.096144168935659e-06, "loss": 0.0042, "num_tokens": 999550704.0, "reward": 0.046142578125, "reward_std": 0.03155044466257095, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.24334746599197388, "step": 2211 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8654570352618967, "frac_reward_zero_std": 0.0, "grad_norm": 0.5897560264256413, "kl": 0.425048828125, "learning_rate": 1.0899347581163222e-06, "loss": 0.0043, "num_tokens": 1000105152.0, "reward": 0.0416259765625, "reward_std": 0.027887098491191864, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3330078125, "rewards/tag_count_reward/std": 0.22620989382266998, "step": 2212 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8658482907027926, "frac_reward_zero_std": 0.0, "grad_norm": 0.6488251953667176, "kl": 0.418212890625, "learning_rate": 1.0837419708896979e-06, "loss": 0.0042, "num_tokens": 1000660000.0, "reward": 0.045166015625, "reward_std": 0.028848104178905487, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.23495639860630035, "step": 2213 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8662395461436886, "frac_reward_zero_std": 0.0, "grad_norm": 0.7807299420490004, "kl": 0.41943359375, "learning_rate": 1.077565818809727e-06, "loss": 0.0042, "num_tokens": 1001215968.0, "reward": 0.048828125, "reward_std": 0.02993520349264145, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.2389724850654602, "step": 2214 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8666308015845845, "frac_reward_zero_std": 0.0, "grad_norm": 0.7401547288351185, "kl": 0.43359375, "learning_rate": 1.071406313399318e-06, "loss": 0.0043, "num_tokens": 1001769920.0, "reward": 0.044189453125, "reward_std": 0.031761378049850464, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.24522867798805237, "step": 2215 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8670220570254805, "frac_reward_zero_std": 0.0, "grad_norm": 0.8985729913061327, "kl": 0.424072265625, "learning_rate": 1.065263466150317e-06, "loss": 0.0042, "num_tokens": 1002325424.0, "reward": 0.0428466796875, "reward_std": 0.027615990489721298, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.22888588905334473, "step": 2216 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8674133124663765, "frac_reward_zero_std": 0.0, "grad_norm": 0.7418068518505104, "kl": 0.429931640625, "learning_rate": 1.0591372885234885e-06, "loss": 0.0043, "num_tokens": 1002878016.0, "reward": 0.050048828125, "reward_std": 0.029979033395648003, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.400390625, "rewards/tag_count_reward/std": 0.24422717094421387, "step": 2217 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8678045679072725, "frac_reward_zero_std": 0.0, "grad_norm": 0.8123323310011206, "kl": 0.439453125, "learning_rate": 1.0530277919485044e-06, "loss": 0.0044, "num_tokens": 1003433936.0, "reward": 0.04443359375, "reward_std": 0.027953756973147392, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.22493872046470642, "step": 2218 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8681958233481685, "frac_reward_zero_std": 0.0, "grad_norm": 1.0189898225611116, "kl": 0.439697265625, "learning_rate": 1.0469349878239077e-06, "loss": 0.0044, "num_tokens": 1003987600.0, "reward": 0.04736328125, "reward_std": 0.02802186831831932, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.22898834943771362, "step": 2219 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8685870787890644, "frac_reward_zero_std": 0.0, "grad_norm": 0.674052774486317, "kl": 0.43359375, "learning_rate": 1.040858887517099e-06, "loss": 0.0043, "num_tokens": 1004543744.0, "reward": 0.046630859375, "reward_std": 0.031989965587854385, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.2591390311717987, "step": 2220 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8689783342299604, "frac_reward_zero_std": 0.0, "grad_norm": 0.6779655172086166, "kl": 0.431396484375, "learning_rate": 1.0347995023643198e-06, "loss": 0.0043, "num_tokens": 1005097232.0, "reward": 0.0465087890625, "reward_std": 0.031580254435539246, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.2524220049381256, "step": 2221 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8693695896708563, "frac_reward_zero_std": 0.0, "grad_norm": 0.7386849440662112, "kl": 0.427734375, "learning_rate": 1.0287568436706208e-06, "loss": 0.0043, "num_tokens": 1005653584.0, "reward": 0.048583984375, "reward_std": 0.02860783040523529, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.23180577158927917, "step": 2222 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 0.9921875, "completions/max_length": 2048.0, "completions/max_terminated_length": 1938.0, "completions/mean_length": 2047.5703125, "completions/mean_terminated_length": 1938.0, "completions/min_length": 1938.0, "completions/min_terminated_length": 1938.0, "epoch": 0.8697608451117523, "frac_reward_zero_std": 0.0, "grad_norm": 0.7501762006461941, "kl": 0.43212890625, "learning_rate": 1.022730922709847e-06, "loss": 0.0042, "num_tokens": 1006209394.0, "reward": 0.041748046875, "reward_std": 0.030211057513952255, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.333984375, "rewards/tag_count_reward/std": 0.24410168826580048, "step": 2223 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8701521005526484, "frac_reward_zero_std": 0.0, "grad_norm": 0.7621492881114289, "kl": 0.43115234375, "learning_rate": 1.0167217507246151e-06, "loss": 0.0043, "num_tokens": 1006763426.0, "reward": 0.046875, "reward_std": 0.030556246638298035, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.2504897117614746, "step": 2224 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8705433559935443, "frac_reward_zero_std": 0.0, "grad_norm": 0.825754760056279, "kl": 0.430419921875, "learning_rate": 1.0107293389262918e-06, "loss": 0.0043, "num_tokens": 1007316226.0, "reward": 0.046142578125, "reward_std": 0.030163051560521126, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.24043434858322144, "step": 2225 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8709346114344403, "frac_reward_zero_std": 0.0, "grad_norm": 0.7141116129582392, "kl": 0.425048828125, "learning_rate": 1.0047536984949813e-06, "loss": 0.0043, "num_tokens": 1007870194.0, "reward": 0.0452880859375, "reward_std": 0.02905551716685295, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.23708805441856384, "step": 2226 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8713258668753362, "frac_reward_zero_std": 0.0, "grad_norm": 0.669109813632136, "kl": 0.413330078125, "learning_rate": 9.987948405794912e-07, "loss": 0.0041, "num_tokens": 1008423890.0, "reward": 0.0458984375, "reward_std": 0.028563467785716057, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.23313191533088684, "step": 2227 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8717171223162322, "frac_reward_zero_std": 0.0, "grad_norm": 0.6172979327428894, "kl": 0.422119140625, "learning_rate": 9.92852776297316e-07, "loss": 0.0042, "num_tokens": 1008977714.0, "reward": 0.04541015625, "reward_std": 0.033068716526031494, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.26818084716796875, "step": 2228 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8721083777571282, "frac_reward_zero_std": 0.0, "grad_norm": 0.6421106585374748, "kl": 0.42041015625, "learning_rate": 9.869275167346237e-07, "loss": 0.0042, "num_tokens": 1009531986.0, "reward": 0.043701171875, "reward_std": 0.030319824814796448, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.2472195327281952, "step": 2229 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8724996331980241, "frac_reward_zero_std": 0.0, "grad_norm": 0.7892180938819824, "kl": 0.420166015625, "learning_rate": 9.810190729462255e-07, "loss": 0.0042, "num_tokens": 1010085490.0, "reward": 0.04736328125, "reward_std": 0.03133229538798332, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.2524087429046631, "step": 2230 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8728908886389202, "frac_reward_zero_std": 0.0, "grad_norm": 0.6986432015881942, "kl": 0.43505859375, "learning_rate": 9.75127455955559e-07, "loss": 0.0043, "num_tokens": 1010638898.0, "reward": 0.0450439453125, "reward_std": 0.027269527316093445, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.22312459349632263, "step": 2231 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8732821440798161, "frac_reward_zero_std": 0.0, "grad_norm": 0.7084255558383921, "kl": 0.423583984375, "learning_rate": 9.692526767546727e-07, "loss": 0.0042, "num_tokens": 1011192402.0, "reward": 0.0467529296875, "reward_std": 0.028514500707387924, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.22794699668884277, "step": 2232 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8736733995207121, "frac_reward_zero_std": 0.0, "grad_norm": 0.738039772378904, "kl": 0.422119140625, "learning_rate": 9.63394746304198e-07, "loss": 0.0042, "num_tokens": 1011744722.0, "reward": 0.0443115234375, "reward_std": 0.030239518731832504, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3544921875, "rewards/tag_count_reward/std": 0.24964551627635956, "step": 2233 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.874064654961608, "frac_reward_zero_std": 0.0, "grad_norm": 0.6418572396406403, "kl": 0.434814453125, "learning_rate": 9.57553675533328e-07, "loss": 0.0043, "num_tokens": 1012298306.0, "reward": 0.0447998046875, "reward_std": 0.02809428982436657, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.22841691970825195, "step": 2234 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.874455910402504, "frac_reward_zero_std": 0.0, "grad_norm": 0.6332127225538601, "kl": 0.42724609375, "learning_rate": 9.517294753398066e-07, "loss": 0.0043, "num_tokens": 1012852130.0, "reward": 0.04541015625, "reward_std": 0.02716030552983284, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.2232983112335205, "step": 2235 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8748471658434, "frac_reward_zero_std": 0.0, "grad_norm": 0.66649472327159, "kl": 0.423583984375, "learning_rate": 9.459221565898935e-07, "loss": 0.0042, "num_tokens": 1013407810.0, "reward": 0.043212890625, "reward_std": 0.028257988393306732, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.345703125, "rewards/tag_count_reward/std": 0.23246566951274872, "step": 2236 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.875238421284296, "frac_reward_zero_std": 0.0, "grad_norm": 0.6674690907167757, "kl": 0.4287109375, "learning_rate": 9.401317301183655e-07, "loss": 0.0043, "num_tokens": 1013961106.0, "reward": 0.044189453125, "reward_std": 0.028860535472631454, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.23226790130138397, "step": 2237 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.875629676725192, "frac_reward_zero_std": 0.0, "grad_norm": 0.6161196040616203, "kl": 0.42431640625, "learning_rate": 9.343582067284707e-07, "loss": 0.0042, "num_tokens": 1014516050.0, "reward": 0.0416259765625, "reward_std": 0.030807344242930412, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3330078125, "rewards/tag_count_reward/std": 0.24593627452850342, "step": 2238 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8760209321660879, "frac_reward_zero_std": 0.0, "grad_norm": 0.7607349362580104, "kl": 0.4248046875, "learning_rate": 9.286015971919282e-07, "loss": 0.0042, "num_tokens": 1015070898.0, "reward": 0.0458984375, "reward_std": 0.027795445173978806, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.22346974909305573, "step": 2239 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8764121876069839, "frac_reward_zero_std": 0.0, "grad_norm": 0.7003309919304589, "kl": 0.42919921875, "learning_rate": 9.22861912248898e-07, "loss": 0.0043, "num_tokens": 1015625026.0, "reward": 0.05126953125, "reward_std": 0.03146814554929733, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.41015625, "rewards/tag_count_reward/std": 0.25192275643348694, "step": 2240 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8768034430478799, "frac_reward_zero_std": 0.0, "grad_norm": 0.6178074585186288, "kl": 0.420654296875, "learning_rate": 9.171391626079629e-07, "loss": 0.0042, "num_tokens": 1016180242.0, "reward": 0.046630859375, "reward_std": 0.02808782272040844, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.22794069349765778, "step": 2241 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8771946984887758, "frac_reward_zero_std": 0.0, "grad_norm": 0.7416971673166355, "kl": 0.429443359375, "learning_rate": 9.114333589461144e-07, "loss": 0.0043, "num_tokens": 1016733042.0, "reward": 0.0439453125, "reward_std": 0.029260706156492233, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.23833060264587402, "step": 2242 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8775859539296719, "frac_reward_zero_std": 0.0, "grad_norm": 0.69735358394065, "kl": 0.422119140625, "learning_rate": 9.057445119087216e-07, "loss": 0.0042, "num_tokens": 1017286946.0, "reward": 0.047119140625, "reward_std": 0.028874173760414124, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.2394767552614212, "step": 2243 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8779772093705678, "frac_reward_zero_std": 0.0, "grad_norm": 0.6053838607421819, "kl": 0.41552734375, "learning_rate": 9.00072632109521e-07, "loss": 0.0042, "num_tokens": 1017841282.0, "reward": 0.0443115234375, "reward_std": 0.029516585171222687, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3544921875, "rewards/tag_count_reward/std": 0.23549975454807281, "step": 2244 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8783684648114638, "frac_reward_zero_std": 0.0, "grad_norm": 0.7824565600138768, "kl": 0.42041015625, "learning_rate": 8.944177301305912e-07, "loss": 0.0042, "num_tokens": 1018395378.0, "reward": 0.0404052734375, "reward_std": 0.02872810699045658, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3232421875, "rewards/tag_count_reward/std": 0.23484838008880615, "step": 2245 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8787597202523597, "frac_reward_zero_std": 0.0, "grad_norm": 0.7125348583417772, "kl": 0.420654296875, "learning_rate": 8.887798165223382e-07, "loss": 0.0042, "num_tokens": 1018950066.0, "reward": 0.044677734375, "reward_std": 0.03098445013165474, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.2395407110452652, "step": 2246 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8791509756932557, "frac_reward_zero_std": 0.0, "grad_norm": 8.933850456297963, "kl": 0.415283203125, "learning_rate": 8.831589018034659e-07, "loss": 0.0042, "num_tokens": 1019504898.0, "reward": 0.046142578125, "reward_std": 0.0300262663513422, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.2513977587223053, "step": 2247 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8795422311341518, "frac_reward_zero_std": 0.0, "grad_norm": 0.7270193468736067, "kl": 0.41455078125, "learning_rate": 8.775549964609742e-07, "loss": 0.0041, "num_tokens": 1020058498.0, "reward": 0.04833984375, "reward_std": 0.029811210930347443, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.24225124716758728, "step": 2248 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8799334865750477, "frac_reward_zero_std": 0.0, "grad_norm": 0.7569133683393398, "kl": 0.40869140625, "learning_rate": 8.719681109501177e-07, "loss": 0.0041, "num_tokens": 1020611682.0, "reward": 0.0455322265625, "reward_std": 0.02974740043282509, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.24026504158973694, "step": 2249 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8803247420159437, "frac_reward_zero_std": 0.0, "grad_norm": 0.6551950324430823, "kl": 0.413330078125, "learning_rate": 8.66398255694404e-07, "loss": 0.0041, "num_tokens": 1021166834.0, "reward": 0.0469970703125, "reward_std": 0.03043985180556774, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.24454645812511444, "step": 2250 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8807159974568396, "frac_reward_zero_std": 0.0, "grad_norm": 0.7452265595595373, "kl": 0.414306640625, "learning_rate": 8.608454410855627e-07, "loss": 0.0041, "num_tokens": 1021720034.0, "reward": 0.04443359375, "reward_std": 0.02839672565460205, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.22818417847156525, "step": 2251 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8811072528977356, "frac_reward_zero_std": 0.0, "grad_norm": 0.719891602402114, "kl": 0.4091796875, "learning_rate": 8.553096774835312e-07, "loss": 0.0041, "num_tokens": 1022273474.0, "reward": 0.044189453125, "reward_std": 0.029586058109998703, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.23748549818992615, "step": 2252 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8814985083386316, "frac_reward_zero_std": 0.0, "grad_norm": 0.5992780062579696, "kl": 0.40625, "learning_rate": 8.49790975216439e-07, "loss": 0.0041, "num_tokens": 1022828098.0, "reward": 0.0458984375, "reward_std": 0.030614729970693588, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.24840176105499268, "step": 2253 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8818897637795275, "frac_reward_zero_std": 0.0, "grad_norm": 0.723451577259307, "kl": 0.41162109375, "learning_rate": 8.4428934458058e-07, "loss": 0.0041, "num_tokens": 1023382482.0, "reward": 0.0467529296875, "reward_std": 0.030745521187782288, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2504878044128418, "step": 2254 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8822810192204236, "frac_reward_zero_std": 0.0, "grad_norm": 0.7266308913649207, "kl": 0.418701171875, "learning_rate": 8.388047958403955e-07, "loss": 0.0042, "num_tokens": 1023937666.0, "reward": 0.0435791015625, "reward_std": 0.029203353449702263, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.2410924732685089, "step": 2255 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8826722746613195, "frac_reward_zero_std": 0.0, "grad_norm": 0.7688329646180998, "kl": 0.41064453125, "learning_rate": 8.33337339228466e-07, "loss": 0.0041, "num_tokens": 1024491794.0, "reward": 0.0478515625, "reward_std": 0.03039863146841526, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.2464204579591751, "step": 2256 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8830635301022155, "frac_reward_zero_std": 0.0, "grad_norm": 0.7108429059765371, "kl": 0.423095703125, "learning_rate": 8.278869849454718e-07, "loss": 0.0042, "num_tokens": 1025043858.0, "reward": 0.0438232421875, "reward_std": 0.029772121459245682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.23616555333137512, "step": 2257 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8834547855431114, "frac_reward_zero_std": 0.0, "grad_norm": 0.8120187120828294, "kl": 0.431640625, "learning_rate": 8.224537431601886e-07, "loss": 0.0043, "num_tokens": 1025597058.0, "reward": 0.0440673828125, "reward_std": 0.03022567741572857, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.24750401079654694, "step": 2258 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8838460409840074, "frac_reward_zero_std": 0.0, "grad_norm": 0.7066958382614271, "kl": 0.421142578125, "learning_rate": 8.170376240094724e-07, "loss": 0.0042, "num_tokens": 1026150242.0, "reward": 0.04345703125, "reward_std": 0.028625957667827606, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.23058828711509705, "step": 2259 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8842372964249035, "frac_reward_zero_std": 0.0, "grad_norm": 0.7475997194466105, "kl": 0.412109375, "learning_rate": 8.116386375982244e-07, "loss": 0.0041, "num_tokens": 1026706210.0, "reward": 0.0482177734375, "reward_std": 0.03162175044417381, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.2570226192474365, "step": 2260 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8846285518657994, "frac_reward_zero_std": 0.0, "grad_norm": 0.7293590652543483, "kl": 0.4287109375, "learning_rate": 8.062567939993838e-07, "loss": 0.0043, "num_tokens": 1027258530.0, "reward": 0.0479736328125, "reward_std": 0.031227558851242065, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.25325506925582886, "step": 2261 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8850198073066954, "frac_reward_zero_std": 0.0, "grad_norm": 0.7109726192232256, "kl": 0.421875, "learning_rate": 8.008921032539108e-07, "loss": 0.0042, "num_tokens": 1027813570.0, "reward": 0.0474853515625, "reward_std": 0.03099970892071724, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.24748854339122772, "step": 2262 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8854110627475913, "frac_reward_zero_std": 0.0, "grad_norm": 0.6808881444585761, "kl": 0.41455078125, "learning_rate": 7.955445753707536e-07, "loss": 0.0041, "num_tokens": 1028367698.0, "reward": 0.0440673828125, "reward_std": 0.030397683382034302, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.2435106784105301, "step": 2263 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8858023181884873, "frac_reward_zero_std": 0.0, "grad_norm": 0.816207549423929, "kl": 0.433349609375, "learning_rate": 7.902142203268515e-07, "loss": 0.0043, "num_tokens": 1028919762.0, "reward": 0.0472412109375, "reward_std": 0.029869060963392258, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2384571135044098, "step": 2264 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8861935736293833, "frac_reward_zero_std": 0.0, "grad_norm": 0.627737939288875, "kl": 0.4130859375, "learning_rate": 7.849010480670938e-07, "loss": 0.0041, "num_tokens": 1029474034.0, "reward": 0.049560546875, "reward_std": 0.02967381849884987, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.23364879190921783, "step": 2265 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8865848290702792, "frac_reward_zero_std": 0.0, "grad_norm": 0.8076427868918209, "kl": 0.417236328125, "learning_rate": 7.796050685043166e-07, "loss": 0.0042, "num_tokens": 1030028834.0, "reward": 0.0484619140625, "reward_std": 0.029460828751325607, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3876953125, "rewards/tag_count_reward/std": 0.236052006483078, "step": 2266 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8869760845111753, "frac_reward_zero_std": 0.0, "grad_norm": 0.8064209847735511, "kl": 0.416748046875, "learning_rate": 7.743262915192839e-07, "loss": 0.0042, "num_tokens": 1030584498.0, "reward": 0.0477294921875, "reward_std": 0.028861435130238533, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.23525570333003998, "step": 2267 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8873673399520712, "frac_reward_zero_std": 0.0, "grad_norm": 0.7223975395755791, "kl": 0.4140625, "learning_rate": 7.69064726960651e-07, "loss": 0.0041, "num_tokens": 1031138530.0, "reward": 0.046875, "reward_std": 0.029503371566534042, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24050600826740265, "step": 2268 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8877585953929672, "frac_reward_zero_std": 0.0, "grad_norm": 0.9656745194619926, "kl": 0.41650390625, "learning_rate": 7.638203846449754e-07, "loss": 0.0042, "num_tokens": 1031691154.0, "reward": 0.0438232421875, "reward_std": 0.02722657099366188, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3505859375, "rewards/tag_count_reward/std": 0.22226475179195404, "step": 2269 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8881498508338631, "frac_reward_zero_std": 0.0, "grad_norm": 0.7040959274030336, "kl": 0.412353515625, "learning_rate": 7.585932743566749e-07, "loss": 0.0041, "num_tokens": 1032245458.0, "reward": 0.04736328125, "reward_std": 0.02901991456747055, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.2394527643918991, "step": 2270 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8885411062747591, "frac_reward_zero_std": 0.0, "grad_norm": 0.8463151889261792, "kl": 0.40576171875, "learning_rate": 7.53383405848016e-07, "loss": 0.0041, "num_tokens": 1032802786.0, "reward": 0.044677734375, "reward_std": 0.028770089149475098, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.23259742558002472, "step": 2271 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8889323617156552, "frac_reward_zero_std": 0.0, "grad_norm": 0.9632666666080328, "kl": 0.416259765625, "learning_rate": 7.481907888390994e-07, "loss": 0.0042, "num_tokens": 1033356354.0, "reward": 0.04638671875, "reward_std": 0.03105630725622177, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.24750594794750214, "step": 2272 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8893236171565511, "frac_reward_zero_std": 0.0, "grad_norm": 0.7008056516451338, "kl": 0.408447265625, "learning_rate": 7.43015433017844e-07, "loss": 0.0041, "num_tokens": 1033909554.0, "reward": 0.0474853515625, "reward_std": 0.030001726001501083, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.23943477869033813, "step": 2273 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8897148725974471, "frac_reward_zero_std": 0.0, "grad_norm": 0.587915152732619, "kl": 0.40087890625, "learning_rate": 7.378573480399543e-07, "loss": 0.004, "num_tokens": 1034463234.0, "reward": 0.048095703125, "reward_std": 0.031219879165291786, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.384765625, "rewards/tag_count_reward/std": 0.2512758672237396, "step": 2274 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.890106128038343, "frac_reward_zero_std": 0.0, "grad_norm": 0.5543016211646307, "kl": 0.409912109375, "learning_rate": 7.327165435289219e-07, "loss": 0.0041, "num_tokens": 1035015602.0, "reward": 0.0457763671875, "reward_std": 0.028626002371311188, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.23204314708709717, "step": 2275 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.890497383479239, "frac_reward_zero_std": 0.0, "grad_norm": 0.7594419633156322, "kl": 0.4072265625, "learning_rate": 7.275930290759925e-07, "loss": 0.0041, "num_tokens": 1035568834.0, "reward": 0.048583984375, "reward_std": 0.029316261410713196, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.2359972447156906, "step": 2276 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.890888638920135, "frac_reward_zero_std": 0.0, "grad_norm": 0.9635879702096238, "kl": 0.4111328125, "learning_rate": 7.224868142401542e-07, "loss": 0.0041, "num_tokens": 1036120114.0, "reward": 0.0445556640625, "reward_std": 0.029066478833556175, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.23670007288455963, "step": 2277 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8912798943610309, "frac_reward_zero_std": 0.0, "grad_norm": 0.6119001625580149, "kl": 0.40771484375, "learning_rate": 7.173979085481264e-07, "loss": 0.0041, "num_tokens": 1036674290.0, "reward": 0.048828125, "reward_std": 0.030004287138581276, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.2410150170326233, "step": 2278 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.891671149801927, "frac_reward_zero_std": 0.0, "grad_norm": 0.6451977991364766, "kl": 0.39892578125, "learning_rate": 7.123263214943199e-07, "loss": 0.004, "num_tokens": 1037228658.0, "reward": 0.047607421875, "reward_std": 0.030293742194771767, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2424645721912384, "step": 2279 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8920624052428229, "frac_reward_zero_std": 0.0, "grad_norm": 0.6753586141030364, "kl": 0.40625, "learning_rate": 7.072720625408469e-07, "loss": 0.0041, "num_tokens": 1037782498.0, "reward": 0.0462646484375, "reward_std": 0.028352729976177216, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.22681856155395508, "step": 2280 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8924536606837189, "frac_reward_zero_std": 0.0, "grad_norm": 0.6567848634275293, "kl": 0.400634765625, "learning_rate": 7.022351411174866e-07, "loss": 0.004, "num_tokens": 1038337026.0, "reward": 0.047119140625, "reward_std": 0.02791133150458336, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.22686287760734558, "step": 2281 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8928449161246148, "frac_reward_zero_std": 0.0, "grad_norm": 0.6317409684327918, "kl": 0.409423828125, "learning_rate": 6.972155666216684e-07, "loss": 0.0041, "num_tokens": 1038889170.0, "reward": 0.046142578125, "reward_std": 0.030088214203715324, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.2434733361005783, "step": 2282 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8932361715655108, "frac_reward_zero_std": 0.0, "grad_norm": 110.34646490321637, "kl": 0.517333984375, "learning_rate": 6.922133484184612e-07, "loss": 0.0052, "num_tokens": 1039440162.0, "reward": 0.0477294921875, "reward_std": 0.032436687499284744, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.2609412670135498, "step": 2283 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8936274270064068, "frac_reward_zero_std": 0.0, "grad_norm": 0.6623521615661824, "kl": 0.40380859375, "learning_rate": 6.872284958405528e-07, "loss": 0.004, "num_tokens": 1039993762.0, "reward": 0.046630859375, "reward_std": 0.02901475876569748, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.23220194876194, "step": 2284 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8940186824473028, "frac_reward_zero_std": 0.0, "grad_norm": 0.6615284782561036, "kl": 0.402099609375, "learning_rate": 6.82261018188225e-07, "loss": 0.004, "num_tokens": 1040548322.0, "reward": 0.0457763671875, "reward_std": 0.03026755154132843, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24438980221748352, "step": 2285 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8944099378881988, "frac_reward_zero_std": 0.0, "grad_norm": 0.9616689165174108, "kl": 0.404296875, "learning_rate": 6.773109247293497e-07, "loss": 0.004, "num_tokens": 1041103170.0, "reward": 0.0433349609375, "reward_std": 0.028714455664157867, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.22833307087421417, "step": 2286 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8948011933290947, "frac_reward_zero_std": 0.0, "grad_norm": 0.7934102258589469, "kl": 0.409912109375, "learning_rate": 6.723782246993648e-07, "loss": 0.0041, "num_tokens": 1041656898.0, "reward": 0.0439453125, "reward_std": 0.028773196041584015, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.23833060264587402, "step": 2287 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8951924487699907, "frac_reward_zero_std": 0.0, "grad_norm": 0.7609560748398041, "kl": 0.409912109375, "learning_rate": 6.674629273012511e-07, "loss": 0.0041, "num_tokens": 1042210674.0, "reward": 0.0498046875, "reward_std": 0.030864167958498, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3984375, "rewards/tag_count_reward/std": 0.2493865042924881, "step": 2288 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8955837042108867, "frac_reward_zero_std": 0.0, "grad_norm": 0.6727015984450869, "kl": 0.408203125, "learning_rate": 6.625650417055296e-07, "loss": 0.0041, "num_tokens": 1042765666.0, "reward": 0.0455322265625, "reward_std": 0.029006754979491234, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.2351091504096985, "step": 2289 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8959749596517826, "frac_reward_zero_std": 0.0, "grad_norm": 0.5844565875183114, "kl": 0.407470703125, "learning_rate": 6.576845770502305e-07, "loss": 0.0041, "num_tokens": 1043319746.0, "reward": 0.046875, "reward_std": 0.029434993863105774, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24253563582897186, "step": 2290 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8963662150926787, "frac_reward_zero_std": 0.0, "grad_norm": 0.6973656692948732, "kl": 0.408935546875, "learning_rate": 6.528215424408812e-07, "loss": 0.0041, "num_tokens": 1043873666.0, "reward": 0.045166015625, "reward_std": 0.029795493930578232, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2411341667175293, "step": 2291 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8967574705335746, "frac_reward_zero_std": 0.0, "grad_norm": 0.7266327180613176, "kl": 0.400390625, "learning_rate": 6.479759469504931e-07, "loss": 0.004, "num_tokens": 1044428530.0, "reward": 0.0491943359375, "reward_std": 0.02937890589237213, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3935546875, "rewards/tag_count_reward/std": 0.2387620508670807, "step": 2292 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8971487259744706, "frac_reward_zero_std": 0.0, "grad_norm": 0.7541935414619624, "kl": 0.40673828125, "learning_rate": 6.431477996195357e-07, "loss": 0.0041, "num_tokens": 1044983170.0, "reward": 0.0452880859375, "reward_std": 0.028743186965584755, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.23501139879226685, "step": 2293 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8975399814153665, "frac_reward_zero_std": 0.0, "grad_norm": 0.6312619957651008, "kl": 0.41552734375, "learning_rate": 6.383371094559343e-07, "loss": 0.0042, "num_tokens": 1045539650.0, "reward": 0.045166015625, "reward_std": 0.029397230595350266, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2390926480293274, "step": 2294 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8979312368562625, "frac_reward_zero_std": 0.0, "grad_norm": 0.7360583066369446, "kl": 0.4150390625, "learning_rate": 6.335438854350351e-07, "loss": 0.0042, "num_tokens": 1046091378.0, "reward": 0.044921875, "reward_std": 0.03191526606678963, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.2507036626338959, "step": 2295 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8983224922971585, "frac_reward_zero_std": 0.0, "grad_norm": 0.6299248742027472, "kl": 0.4130859375, "learning_rate": 6.287681364996035e-07, "loss": 0.0041, "num_tokens": 1046644354.0, "reward": 0.0479736328125, "reward_std": 0.02981504425406456, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.24136236310005188, "step": 2296 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8987137477380545, "frac_reward_zero_std": 0.0, "grad_norm": 0.6614764695099058, "kl": 0.418701171875, "learning_rate": 6.240098715597975e-07, "loss": 0.0042, "num_tokens": 1047197698.0, "reward": 0.0455322265625, "reward_std": 0.029922405257821083, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.24531260132789612, "step": 2297 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8991050031789505, "frac_reward_zero_std": 0.0, "grad_norm": 0.9319180879523344, "kl": 0.40966796875, "learning_rate": 6.192690994931582e-07, "loss": 0.0041, "num_tokens": 1047752226.0, "reward": 0.048828125, "reward_std": 0.030431073158979416, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.2420298159122467, "step": 2298 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8994962586198464, "frac_reward_zero_std": 0.0, "grad_norm": 0.8072211088113599, "kl": 0.416748046875, "learning_rate": 6.14545829144586e-07, "loss": 0.0042, "num_tokens": 1048307010.0, "reward": 0.0462646484375, "reward_std": 0.02960357442498207, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.23737864196300507, "step": 2299 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.8998875140607424, "frac_reward_zero_std": 0.0, "grad_norm": 0.7988746040189011, "kl": 0.41845703125, "learning_rate": 6.098400693263351e-07, "loss": 0.0042, "num_tokens": 1048861650.0, "reward": 0.0467529296875, "reward_std": 0.029513897374272346, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2514643967151642, "step": 2300 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9002787695016384, "frac_reward_zero_std": 0.0, "grad_norm": 0.9357815935293798, "kl": 0.4189453125, "learning_rate": 6.051518288179847e-07, "loss": 0.0042, "num_tokens": 1049416034.0, "reward": 0.0469970703125, "reward_std": 0.03071928396821022, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.2465428113937378, "step": 2301 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9006700249425343, "frac_reward_zero_std": 0.0, "grad_norm": 0.704601585542822, "kl": 0.421142578125, "learning_rate": 6.00481116366427e-07, "loss": 0.0042, "num_tokens": 1049966722.0, "reward": 0.04638671875, "reward_std": 0.029695942997932434, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.2394527643918991, "step": 2302 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9010612803834304, "frac_reward_zero_std": 0.0, "grad_norm": 0.8151970275189613, "kl": 0.417236328125, "learning_rate": 5.958279406858569e-07, "loss": 0.0042, "num_tokens": 1050520562.0, "reward": 0.0465087890625, "reward_std": 0.02951786294579506, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23637627065181732, "step": 2303 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9014525358243263, "frac_reward_zero_std": 0.0, "grad_norm": 0.8951616558609696, "kl": 0.41455078125, "learning_rate": 5.911923104577455e-07, "loss": 0.0041, "num_tokens": 1051075298.0, "reward": 0.0521240234375, "reward_std": 0.030753757804632187, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4169921875, "rewards/tag_count_reward/std": 0.24493764340877533, "step": 2304 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9018437912652223, "frac_reward_zero_std": 0.0, "grad_norm": 0.7869274580561746, "kl": 0.406982421875, "learning_rate": 5.865742343308345e-07, "loss": 0.0041, "num_tokens": 1051632082.0, "reward": 0.04296875, "reward_std": 0.02802223712205887, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.2268713116645813, "step": 2305 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9022350467061182, "frac_reward_zero_std": 0.0, "grad_norm": 0.8653137028691663, "kl": 0.4130859375, "learning_rate": 5.819737209211107e-07, "loss": 0.0041, "num_tokens": 1052187218.0, "reward": 0.046142578125, "reward_std": 0.029457610100507736, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.24043434858322144, "step": 2306 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9026263021470142, "frac_reward_zero_std": 0.0, "grad_norm": 0.6561386693060391, "kl": 0.4140625, "learning_rate": 5.77390778811796e-07, "loss": 0.0041, "num_tokens": 1052740162.0, "reward": 0.0478515625, "reward_std": 0.02890576422214508, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.23313191533088684, "step": 2307 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9030175575879102, "frac_reward_zero_std": 0.0, "grad_norm": 1.9143742244916606, "kl": 0.416259765625, "learning_rate": 5.728254165533276e-07, "loss": 0.0042, "num_tokens": 1053295986.0, "reward": 0.0478515625, "reward_std": 0.02833862043917179, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.23418088257312775, "step": 2308 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9034088130288062, "frac_reward_zero_std": 0.0, "grad_norm": 0.683094570555641, "kl": 0.418701171875, "learning_rate": 5.682776426633452e-07, "loss": 0.0042, "num_tokens": 1053848834.0, "reward": 0.0457763671875, "reward_std": 0.028573263436555862, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.23098446428775787, "step": 2309 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9038000684697022, "frac_reward_zero_std": 0.0, "grad_norm": 0.6142070968849617, "kl": 0.412353515625, "learning_rate": 5.63747465626674e-07, "loss": 0.0041, "num_tokens": 1054402338.0, "reward": 0.0433349609375, "reward_std": 0.02815636619925499, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.22940398752689362, "step": 2310 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9041913239105981, "frac_reward_zero_std": 0.0, "grad_norm": 0.6720102832320677, "kl": 0.416259765625, "learning_rate": 5.592348938953085e-07, "loss": 0.0042, "num_tokens": 1054956850.0, "reward": 0.0447998046875, "reward_std": 0.0294752586632967, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.24094946682453156, "step": 2311 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9045825793514941, "frac_reward_zero_std": 0.0, "grad_norm": 0.6638549775496072, "kl": 0.41796875, "learning_rate": 5.547399358883953e-07, "loss": 0.0042, "num_tokens": 1055510306.0, "reward": 0.0477294921875, "reward_std": 0.028577983379364014, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.22685232758522034, "step": 2312 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9049738347923901, "frac_reward_zero_std": 0.0, "grad_norm": 0.7436698418886464, "kl": 0.418701171875, "learning_rate": 5.502625999922207e-07, "loss": 0.0042, "num_tokens": 1056065618.0, "reward": 0.0465087890625, "reward_std": 0.02903435379266739, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23429329693317413, "step": 2313 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.905365090233286, "frac_reward_zero_std": 0.0, "grad_norm": 0.7232250189867582, "kl": 0.412841796875, "learning_rate": 5.458028945601912e-07, "loss": 0.0041, "num_tokens": 1056618978.0, "reward": 0.044921875, "reward_std": 0.02835910953581333, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.2284860759973526, "step": 2314 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9057563456741821, "frac_reward_zero_std": 0.0, "grad_norm": 0.7702744489300719, "kl": 0.415283203125, "learning_rate": 5.413608279128213e-07, "loss": 0.0042, "num_tokens": 1057174802.0, "reward": 0.0460205078125, "reward_std": 0.031248774379491806, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.24843066930770874, "step": 2315 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.906147601115078, "frac_reward_zero_std": 0.0, "grad_norm": 0.7592662316627966, "kl": 0.43408203125, "learning_rate": 5.369364083377182e-07, "loss": 0.0043, "num_tokens": 1057728722.0, "reward": 0.0469970703125, "reward_std": 0.028878286480903625, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.23008742928504944, "step": 2316 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.906538856555974, "frac_reward_zero_std": 0.0, "grad_norm": 0.6957120528528464, "kl": 0.422119140625, "learning_rate": 5.325296440895622e-07, "loss": 0.0042, "num_tokens": 1058281954.0, "reward": 0.0423583984375, "reward_std": 0.029518458992242813, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3388671875, "rewards/tag_count_reward/std": 0.23776550590991974, "step": 2317 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9069301119968699, "frac_reward_zero_std": 0.0, "grad_norm": 0.7923634869228461, "kl": 0.419921875, "learning_rate": 5.281405433900966e-07, "loss": 0.0042, "num_tokens": 1058835154.0, "reward": 0.0472412109375, "reward_std": 0.028961893171072006, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.23741090297698975, "step": 2318 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9073213674377659, "frac_reward_zero_std": 0.0, "grad_norm": 0.7906846998358358, "kl": 0.427978515625, "learning_rate": 5.237691144281054e-07, "loss": 0.0043, "num_tokens": 1059388610.0, "reward": 0.0447998046875, "reward_std": 0.028306178748607635, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.22841691970825195, "step": 2319 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.907712622878662, "frac_reward_zero_std": 0.0, "grad_norm": 0.6804037451764183, "kl": 0.418212890625, "learning_rate": 5.194153653594036e-07, "loss": 0.0042, "num_tokens": 1059944162.0, "reward": 0.05322265625, "reward_std": 0.044300276786088943, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.23765476047992706, "step": 2320 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9081038783195579, "frac_reward_zero_std": 0.0, "grad_norm": 0.8049478807231343, "kl": 0.43017578125, "learning_rate": 5.150793043068269e-07, "loss": 0.0043, "num_tokens": 1060499842.0, "reward": 0.049072265625, "reward_std": 0.03204552084207535, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.2556874454021454, "step": 2321 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9084951337604539, "frac_reward_zero_std": 0.0, "grad_norm": 0.7627787404562039, "kl": 0.423828125, "learning_rate": 5.107609393602019e-07, "loss": 0.0042, "num_tokens": 1061052450.0, "reward": 0.0467529296875, "reward_std": 0.0311207864433527, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2572460174560547, "step": 2322 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9088863892013498, "frac_reward_zero_std": 0.0, "grad_norm": 0.671192403023806, "kl": 0.4306640625, "learning_rate": 5.064602785763417e-07, "loss": 0.0043, "num_tokens": 1061606818.0, "reward": 0.0458984375, "reward_std": 0.030209723860025406, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.24240928888320923, "step": 2323 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9092776446422458, "frac_reward_zero_std": 0.0, "grad_norm": 0.8397597083476399, "kl": 0.425048828125, "learning_rate": 5.021773299790356e-07, "loss": 0.0042, "num_tokens": 1062160610.0, "reward": 0.04736328125, "reward_std": 0.03368523344397545, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.2665766477584839, "step": 2324 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9096689000831418, "frac_reward_zero_std": 0.0, "grad_norm": 0.8697997148536608, "kl": 0.42724609375, "learning_rate": 4.979121015590138e-07, "loss": 0.0043, "num_tokens": 1062715586.0, "reward": 0.0482177734375, "reward_std": 0.031981293112039566, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.26174721121788025, "step": 2325 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9100601555240377, "frac_reward_zero_std": 0.0, "grad_norm": 0.6674210923066376, "kl": 0.4267578125, "learning_rate": 4.936646012739554e-07, "loss": 0.0043, "num_tokens": 1063270546.0, "reward": 0.0447998046875, "reward_std": 0.03190524876117706, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.25382906198501587, "step": 2326 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9104514109649338, "frac_reward_zero_std": 0.0, "grad_norm": 4.985483520526144, "kl": 0.420654296875, "learning_rate": 4.894348370484648e-07, "loss": 0.0042, "num_tokens": 1063823970.0, "reward": 0.046142578125, "reward_std": 0.028418362140655518, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.23107771575450897, "step": 2327 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9108426664058297, "frac_reward_zero_std": 0.0, "grad_norm": 0.9263454594575663, "kl": 0.435546875, "learning_rate": 4.852228167740503e-07, "loss": 0.0044, "num_tokens": 1064380114.0, "reward": 0.0472412109375, "reward_std": 0.02958439290523529, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.23741090297698975, "step": 2328 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9112339218467257, "frac_reward_zero_std": 0.0, "grad_norm": 1.157043304597432, "kl": 0.4326171875, "learning_rate": 4.810285483091181e-07, "loss": 0.0043, "num_tokens": 1064932066.0, "reward": 0.047119140625, "reward_std": 0.030727367848157883, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.24454058706760406, "step": 2329 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9116251772876216, "frac_reward_zero_std": 0.0, "grad_norm": 0.834597369065928, "kl": 0.422119140625, "learning_rate": 4.768520394789544e-07, "loss": 0.0042, "num_tokens": 1065487650.0, "reward": 0.046142578125, "reward_std": 0.0302869975566864, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.24043434858322144, "step": 2330 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9120164327285176, "frac_reward_zero_std": 0.0, "grad_norm": 1.1327459243341598, "kl": 0.425048828125, "learning_rate": 4.726932980757115e-07, "loss": 0.0043, "num_tokens": 1066042786.0, "reward": 0.0467529296875, "reward_std": 0.028792429715394974, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.23639246821403503, "step": 2331 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9124076881694136, "frac_reward_zero_std": 0.0, "grad_norm": 0.7747234348490335, "kl": 0.432373046875, "learning_rate": 4.6855233185839175e-07, "loss": 0.0043, "num_tokens": 1066596418.0, "reward": 0.0465087890625, "reward_std": 0.02983901835978031, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.23525570333003998, "step": 2332 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9127989436103096, "frac_reward_zero_std": 0.0, "grad_norm": 0.82592593307419, "kl": 0.43017578125, "learning_rate": 4.644291485528363e-07, "loss": 0.0043, "num_tokens": 1067146866.0, "reward": 0.0428466796875, "reward_std": 0.02918728068470955, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3427734375, "rewards/tag_count_reward/std": 0.2341788411140442, "step": 2333 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9131901990512056, "frac_reward_zero_std": 0.0, "grad_norm": 0.8099607686888719, "kl": 0.43359375, "learning_rate": 4.6032375585170486e-07, "loss": 0.0043, "num_tokens": 1067702226.0, "reward": 0.044921875, "reward_std": 0.029715191572904587, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.23999592661857605, "step": 2334 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9135814544921015, "frac_reward_zero_std": 0.0, "grad_norm": 0.7012151806859838, "kl": 0.43017578125, "learning_rate": 4.562361614144717e-07, "loss": 0.0043, "num_tokens": 1068255682.0, "reward": 0.04736328125, "reward_std": 0.02970942296087742, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.24149124324321747, "step": 2335 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9139727099329975, "frac_reward_zero_std": 0.0, "grad_norm": 0.8195917216268312, "kl": 0.4296875, "learning_rate": 4.5216637286739416e-07, "loss": 0.0043, "num_tokens": 1068810770.0, "reward": 0.0477294921875, "reward_std": 0.03049113228917122, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.24344776570796967, "step": 2336 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9143639653738935, "frac_reward_zero_std": 0.0, "grad_norm": 0.8041021035144205, "kl": 0.4287109375, "learning_rate": 4.481143978035196e-07, "loss": 0.0043, "num_tokens": 1069365378.0, "reward": 0.0467529296875, "reward_std": 0.029184162616729736, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.23639246821403503, "step": 2337 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9147552208147894, "frac_reward_zero_std": 0.0, "grad_norm": 0.656320449846947, "kl": 0.424072265625, "learning_rate": 4.4408024378265413e-07, "loss": 0.0042, "num_tokens": 1069920722.0, "reward": 0.0465087890625, "reward_std": 0.028976470232009888, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23533709347248077, "step": 2338 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9151464762556855, "frac_reward_zero_std": 0.0, "grad_norm": 0.9512571332100611, "kl": 0.43505859375, "learning_rate": 4.400639183313571e-07, "loss": 0.0043, "num_tokens": 1070473938.0, "reward": 0.0450439453125, "reward_std": 0.02675144001841545, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.21529796719551086, "step": 2339 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9155377316965814, "frac_reward_zero_std": 0.0, "grad_norm": 0.7273838439987345, "kl": 0.42529296875, "learning_rate": 4.360654289429267e-07, "loss": 0.0043, "num_tokens": 1071028290.0, "reward": 0.044921875, "reward_std": 0.029266037046909332, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.236912339925766, "step": 2340 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9159289871374774, "frac_reward_zero_std": 0.0, "grad_norm": 0.7866352797418218, "kl": 0.43798828125, "learning_rate": 4.3208478307738e-07, "loss": 0.0044, "num_tokens": 1071582114.0, "reward": 0.0445556640625, "reward_std": 0.02876538783311844, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.2356623113155365, "step": 2341 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9163202425783733, "frac_reward_zero_std": 0.0, "grad_norm": 0.7102793764577604, "kl": 0.41845703125, "learning_rate": 4.281219881614451e-07, "loss": 0.0042, "num_tokens": 1072136450.0, "reward": 0.0457763671875, "reward_std": 0.030906379222869873, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24935387074947357, "step": 2342 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9167114980192693, "frac_reward_zero_std": 0.0, "grad_norm": 0.8846234738789192, "kl": 0.455810546875, "learning_rate": 4.2417705158854795e-07, "loss": 0.0046, "num_tokens": 1072689074.0, "reward": 0.0460205078125, "reward_std": 0.03167707473039627, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.25524333119392395, "step": 2343 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9171027534601653, "frac_reward_zero_std": 0.0, "grad_norm": 1.0482950369095216, "kl": 0.44677734375, "learning_rate": 4.202499807187932e-07, "loss": 0.0045, "num_tokens": 1073244178.0, "reward": 0.04638671875, "reward_std": 0.030121009796857834, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.24451708793640137, "step": 2344 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9174940089010613, "frac_reward_zero_std": 0.0, "grad_norm": 0.7864612756358316, "kl": 0.426025390625, "learning_rate": 4.163407828789523e-07, "loss": 0.0043, "num_tokens": 1073797026.0, "reward": 0.0511474609375, "reward_std": 0.029225314036011696, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4091796875, "rewards/tag_count_reward/std": 0.23702344298362732, "step": 2345 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9178852643419573, "frac_reward_zero_std": 0.0, "grad_norm": 0.8624076632061674, "kl": 0.424560546875, "learning_rate": 4.1244946536245554e-07, "loss": 0.0042, "num_tokens": 1074350562.0, "reward": 0.044189453125, "reward_std": 0.030603859573602676, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.24659912288188934, "step": 2346 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9182765197828532, "frac_reward_zero_std": 0.0, "grad_norm": 0.8198976201842637, "kl": 0.417724609375, "learning_rate": 4.0857603542936776e-07, "loss": 0.0042, "num_tokens": 1074903394.0, "reward": 0.047119140625, "reward_std": 0.03414611518383026, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.265677273273468, "step": 2347 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9186677752237492, "frac_reward_zero_std": 0.0, "grad_norm": 0.6568061249547116, "kl": 0.418701171875, "learning_rate": 4.047205003063859e-07, "loss": 0.0042, "num_tokens": 1075458322.0, "reward": 0.0496826171875, "reward_std": 0.029706193134188652, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3974609375, "rewards/tag_count_reward/std": 0.24250207841396332, "step": 2348 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9190590306646452, "frac_reward_zero_std": 0.0, "grad_norm": 0.7413026200590273, "kl": 0.41796875, "learning_rate": 4.0088286718681703e-07, "loss": 0.0042, "num_tokens": 1076013170.0, "reward": 0.0460205078125, "reward_std": 0.030100267380475998, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.24142581224441528, "step": 2349 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9194502861055411, "frac_reward_zero_std": 0.0, "grad_norm": 0.8330686633543375, "kl": 0.428466796875, "learning_rate": 3.9706314323056936e-07, "loss": 0.0043, "num_tokens": 1076564930.0, "reward": 0.046630859375, "reward_std": 0.026932697743177414, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.22249938547611237, "step": 2350 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9198415415464372, "frac_reward_zero_std": 0.0, "grad_norm": 0.7441926157771712, "kl": 0.419921875, "learning_rate": 3.932613355641379e-07, "loss": 0.0042, "num_tokens": 1077117986.0, "reward": 0.0440673828125, "reward_std": 0.028986500576138496, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.2373947650194168, "step": 2351 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9202327969873331, "frac_reward_zero_std": 0.0, "grad_norm": 0.8388973806513261, "kl": 0.419677734375, "learning_rate": 3.894774512805932e-07, "loss": 0.0042, "num_tokens": 1077670706.0, "reward": 0.045166015625, "reward_std": 0.029960639774799347, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.24416443705558777, "step": 2352 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9206240524282291, "frac_reward_zero_std": 0.0, "grad_norm": 0.7659973225744483, "kl": 0.421142578125, "learning_rate": 3.857114974395604e-07, "loss": 0.0042, "num_tokens": 1078224242.0, "reward": 0.0482177734375, "reward_std": 0.0312562994658947, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.25414571166038513, "step": 2353 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9210153078691251, "frac_reward_zero_std": 0.0, "grad_norm": 0.9954514572425048, "kl": 0.423095703125, "learning_rate": 3.819634810672168e-07, "loss": 0.0042, "num_tokens": 1078777682.0, "reward": 0.0458984375, "reward_std": 0.028714314103126526, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.23626485466957092, "step": 2354 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.921406563310021, "frac_reward_zero_std": 0.0, "grad_norm": 0.8125932134798833, "kl": 0.417236328125, "learning_rate": 3.782334091562723e-07, "loss": 0.0042, "num_tokens": 1079329810.0, "reward": 0.050537109375, "reward_std": 0.029979007318615913, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.404296875, "rewards/tag_count_reward/std": 0.2407526969909668, "step": 2355 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.921797818750917, "frac_reward_zero_std": 0.0, "grad_norm": 0.6993803829733978, "kl": 0.416748046875, "learning_rate": 3.7452128866595547e-07, "loss": 0.0042, "num_tokens": 1079884434.0, "reward": 0.04443359375, "reward_std": 0.029170256108045578, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.23970851302146912, "step": 2356 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.922189074191813, "frac_reward_zero_std": 0.0, "grad_norm": 0.820144331784485, "kl": 0.417724609375, "learning_rate": 3.708271265220087e-07, "loss": 0.0042, "num_tokens": 1080437810.0, "reward": 0.044677734375, "reward_std": 0.030399540439248085, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.24690952897071838, "step": 2357 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.922580329632709, "frac_reward_zero_std": 0.0, "grad_norm": 0.6090489793409426, "kl": 0.42041015625, "learning_rate": 3.6715092961665866e-07, "loss": 0.0042, "num_tokens": 1080988418.0, "reward": 0.046630859375, "reward_std": 0.030426137149333954, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.24353623390197754, "step": 2358 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9229715850736049, "frac_reward_zero_std": 0.0, "grad_norm": 0.6661440231028686, "kl": 0.41455078125, "learning_rate": 3.6349270480862566e-07, "loss": 0.0041, "num_tokens": 1081543218.0, "reward": 0.0450439453125, "reward_std": 0.029044587165117264, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.23800699412822723, "step": 2359 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9233628405145009, "frac_reward_zero_std": 0.0, "grad_norm": 0.892750399216865, "kl": 0.409423828125, "learning_rate": 3.598524589230923e-07, "loss": 0.0041, "num_tokens": 1082098130.0, "reward": 0.0452880859375, "reward_std": 0.028690390288829803, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.2308020144701004, "step": 2360 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9237540959553969, "frac_reward_zero_std": 0.0, "grad_norm": 0.7172598260479605, "kl": 0.412109375, "learning_rate": 3.5623019875169916e-07, "loss": 0.0041, "num_tokens": 1082651074.0, "reward": 0.0477294921875, "reward_std": 0.03100251406431198, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.2513729929924011, "step": 2361 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9241453513962928, "frac_reward_zero_std": 0.0, "grad_norm": 0.6442119127405795, "kl": 0.407958984375, "learning_rate": 3.5262593105253374e-07, "loss": 0.0041, "num_tokens": 1083203922.0, "reward": 0.0445556640625, "reward_std": 0.03160176798701286, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.24551546573638916, "step": 2362 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9245366068371889, "frac_reward_zero_std": 0.0, "grad_norm": 1.0919654712601148, "kl": 0.40966796875, "learning_rate": 3.4903966255010915e-07, "loss": 0.0041, "num_tokens": 1083758722.0, "reward": 0.0469970703125, "reward_std": 0.029787981882691383, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.2384571135044098, "step": 2363 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9249278622780848, "frac_reward_zero_std": 0.0, "grad_norm": 0.7733363570928128, "kl": 0.408935546875, "learning_rate": 3.4547139993536205e-07, "loss": 0.0041, "num_tokens": 1084311986.0, "reward": 0.0484619140625, "reward_std": 0.03034406155347824, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3876953125, "rewards/tag_count_reward/std": 0.24621640145778656, "step": 2364 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9253191177189808, "frac_reward_zero_std": 0.0, "grad_norm": 0.919037915591073, "kl": 0.407470703125, "learning_rate": 3.4192114986563386e-07, "loss": 0.0041, "num_tokens": 1084868722.0, "reward": 0.0439453125, "reward_std": 0.028406579047441483, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.23522518575191498, "step": 2365 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9257103731598768, "frac_reward_zero_std": 0.0, "grad_norm": 0.7214991045754489, "kl": 0.40966796875, "learning_rate": 3.383889189646583e-07, "loss": 0.0041, "num_tokens": 1085421586.0, "reward": 0.047119140625, "reward_std": 0.030655454844236374, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.25145867466926575, "step": 2366 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9261016286007727, "frac_reward_zero_std": 0.0, "grad_norm": 0.749307807249115, "kl": 0.409423828125, "learning_rate": 3.3487471382255277e-07, "loss": 0.0041, "num_tokens": 1085975666.0, "reward": 0.044921875, "reward_std": 0.029660172760486603, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.23999592661857605, "step": 2367 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9264928840416687, "frac_reward_zero_std": 0.0, "grad_norm": 0.8401043496847368, "kl": 0.40185546875, "learning_rate": 3.313785409958026e-07, "loss": 0.004, "num_tokens": 1086529986.0, "reward": 0.045654296875, "reward_std": 0.03091522678732872, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.2512758672237396, "step": 2368 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9268841394825647, "frac_reward_zero_std": 0.0, "grad_norm": 0.8560907039539225, "kl": 0.407470703125, "learning_rate": 3.2790040700725114e-07, "loss": 0.0041, "num_tokens": 1087082658.0, "reward": 0.0462646484375, "reward_std": 0.02897673100233078, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.23321199417114258, "step": 2369 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9272753949234607, "frac_reward_zero_std": 0.0, "grad_norm": 0.9026245804044901, "kl": 0.410888671875, "learning_rate": 3.2444031834608427e-07, "loss": 0.0041, "num_tokens": 1087636322.0, "reward": 0.0489501953125, "reward_std": 0.030247189104557037, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3876953125, "rewards/tag_count_reward/std": 0.23811960220336914, "step": 2370 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9276666503643566, "frac_reward_zero_std": 0.0, "grad_norm": 1.0189354421429662, "kl": 0.396484375, "learning_rate": 3.2099828146782364e-07, "loss": 0.004, "num_tokens": 1088192802.0, "reward": 0.04931640625, "reward_std": 0.031336136162281036, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.25361964106559753, "step": 2371 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9280579058052526, "frac_reward_zero_std": 0.0, "grad_norm": 0.7957730142853291, "kl": 0.41455078125, "learning_rate": 3.175743027943079e-07, "loss": 0.0041, "num_tokens": 1088746242.0, "reward": 0.0439453125, "reward_std": 0.02825760468840599, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.23313191533088684, "step": 2372 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9284491612461486, "frac_reward_zero_std": 0.0, "grad_norm": 87.41509287141254, "kl": 0.494140625, "learning_rate": 3.1416838871368925e-07, "loss": 0.0049, "num_tokens": 1089298514.0, "reward": 0.048828125, "reward_std": 0.030860386788845062, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.38671875, "rewards/tag_count_reward/std": 0.24225124716758728, "step": 2373 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9288404166870445, "frac_reward_zero_std": 0.0, "grad_norm": 0.8610332400817252, "kl": 0.41015625, "learning_rate": 3.107805455804114e-07, "loss": 0.0041, "num_tokens": 1089851762.0, "reward": 0.0458984375, "reward_std": 0.029814042150974274, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.23833060264587402, "step": 2374 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9292316721279406, "frac_reward_zero_std": 0.0, "grad_norm": 0.7869605078424312, "kl": 0.41552734375, "learning_rate": 3.074107797152059e-07, "loss": 0.0042, "num_tokens": 1090407122.0, "reward": 0.0440673828125, "reward_std": 0.030197400599718094, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.24551546573638916, "step": 2375 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9296229275688365, "frac_reward_zero_std": 0.0, "grad_norm": 0.689808700993645, "kl": 0.40673828125, "learning_rate": 3.0405909740507613e-07, "loss": 0.0041, "num_tokens": 1090960290.0, "reward": 0.0433349609375, "reward_std": 0.030655790120363235, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3466796875, "rewards/tag_count_reward/std": 0.2537536025047302, "step": 2376 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9300141830097325, "frac_reward_zero_std": 0.0, "grad_norm": 0.8203351858358818, "kl": 0.404541015625, "learning_rate": 3.0072550490328754e-07, "loss": 0.004, "num_tokens": 1091514610.0, "reward": 0.0487060546875, "reward_std": 0.03130338340997696, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3896484375, "rewards/tag_count_reward/std": 0.2549130618572235, "step": 2377 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9304054384506285, "frac_reward_zero_std": 0.0, "grad_norm": 0.672352866074737, "kl": 0.397705078125, "learning_rate": 2.974100084293563e-07, "loss": 0.004, "num_tokens": 1092070338.0, "reward": 0.0455322265625, "reward_std": 0.03166327252984047, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.24410757422447205, "step": 2378 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9307966938915244, "frac_reward_zero_std": 0.0, "grad_norm": 0.8194228281070425, "kl": 0.40087890625, "learning_rate": 2.9411261416903425e-07, "loss": 0.004, "num_tokens": 1092624770.0, "reward": 0.04638671875, "reward_std": 0.0290945116430521, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.2404741644859314, "step": 2379 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9311879493324204, "frac_reward_zero_std": 0.0, "grad_norm": 0.7191789578183841, "kl": 0.40625, "learning_rate": 2.9083332827430253e-07, "loss": 0.0041, "num_tokens": 1093177826.0, "reward": 0.0439453125, "reward_std": 0.03139212355017662, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3515625, "rewards/tag_count_reward/std": 0.24840176105499268, "step": 2380 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9315792047733163, "frac_reward_zero_std": 0.0, "grad_norm": 0.8025485225346116, "kl": 0.413330078125, "learning_rate": 2.875721568633527e-07, "loss": 0.0041, "num_tokens": 1093730530.0, "reward": 0.0482177734375, "reward_std": 0.03180832043290138, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.258922815322876, "step": 2381 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9319704602142124, "frac_reward_zero_std": 0.0, "grad_norm": 0.7356464586834165, "kl": 0.3984375, "learning_rate": 2.843291060205855e-07, "loss": 0.004, "num_tokens": 1094284466.0, "reward": 0.0478515625, "reward_std": 0.030523333698511124, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.24741309881210327, "step": 2382 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9323617156551083, "frac_reward_zero_std": 0.0, "grad_norm": 0.8242242986995717, "kl": 0.40185546875, "learning_rate": 2.8110418179658983e-07, "loss": 0.004, "num_tokens": 1094840498.0, "reward": 0.0465087890625, "reward_std": 0.028932852670550346, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23429329693317413, "step": 2383 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9327529710960043, "frac_reward_zero_std": 0.0, "grad_norm": 0.7966523416980877, "kl": 0.4111328125, "learning_rate": 2.778973902081394e-07, "loss": 0.0041, "num_tokens": 1095393842.0, "reward": 0.049560546875, "reward_std": 0.029973894357681274, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.24157844483852386, "step": 2384 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9331442265369003, "frac_reward_zero_std": 0.0, "grad_norm": 0.7161002542876325, "kl": 0.40478515625, "learning_rate": 2.7470873723817405e-07, "loss": 0.004, "num_tokens": 1095947042.0, "reward": 0.0440673828125, "reward_std": 0.02868770807981491, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.23427695035934448, "step": 2385 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9335354819777962, "frac_reward_zero_std": 0.0, "grad_norm": 0.7314414088409195, "kl": 0.400634765625, "learning_rate": 2.715382288357937e-07, "loss": 0.004, "num_tokens": 1096502754.0, "reward": 0.044921875, "reward_std": 0.029651228338479996, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.236912339925766, "step": 2386 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9339267374186923, "frac_reward_zero_std": 0.0, "grad_norm": 0.7134973286294604, "kl": 0.404052734375, "learning_rate": 2.683858709162468e-07, "loss": 0.004, "num_tokens": 1097058050.0, "reward": 0.04638671875, "reward_std": 0.029984209686517715, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.24351264536380768, "step": 2387 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9343179928595882, "frac_reward_zero_std": 0.0, "grad_norm": 0.774694431015463, "kl": 0.42138671875, "learning_rate": 2.6525166936091416e-07, "loss": 0.0042, "num_tokens": 1097609874.0, "reward": 0.045166015625, "reward_std": 0.027195177972316742, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2264573723077774, "step": 2388 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9347092483004842, "frac_reward_zero_std": 0.0, "grad_norm": 0.6449136835563681, "kl": 0.40576171875, "learning_rate": 2.6213563001730834e-07, "loss": 0.0041, "num_tokens": 1098162738.0, "reward": 0.048828125, "reward_std": 0.029037343338131905, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.236912339925766, "step": 2389 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9351005037413802, "frac_reward_zero_std": 0.0, "grad_norm": 0.7710362352710103, "kl": 0.405029296875, "learning_rate": 2.5903775869905337e-07, "loss": 0.004, "num_tokens": 1098716450.0, "reward": 0.045166015625, "reward_std": 0.029453923925757408, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.24011556804180145, "step": 2390 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9354917591822761, "frac_reward_zero_std": 0.0, "grad_norm": 0.6707861497813098, "kl": 0.402587890625, "learning_rate": 2.5595806118587474e-07, "loss": 0.004, "num_tokens": 1099270178.0, "reward": 0.0496826171875, "reward_std": 0.02967997081577778, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3974609375, "rewards/tag_count_reward/std": 0.23945076763629913, "step": 2391 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9358830146231721, "frac_reward_zero_std": 0.0, "grad_norm": 0.8245491611147729, "kl": 0.40771484375, "learning_rate": 2.5289654322359526e-07, "loss": 0.0041, "num_tokens": 1099824242.0, "reward": 0.0465087890625, "reward_std": 0.030266499146819115, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.2465272843837738, "step": 2392 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.936274270064068, "frac_reward_zero_std": 0.0, "grad_norm": 0.7974953508525183, "kl": 0.415283203125, "learning_rate": 2.498532105241158e-07, "loss": 0.0042, "num_tokens": 1100375922.0, "reward": 0.0460205078125, "reward_std": 0.02919076755642891, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.23938678205013275, "step": 2393 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9366655255049641, "frac_reward_zero_std": 0.0, "grad_norm": 0.7816375714169971, "kl": 0.3974609375, "learning_rate": 2.468280687654134e-07, "loss": 0.004, "num_tokens": 1100927890.0, "reward": 0.0458984375, "reward_std": 0.029398297891020775, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.23296760022640228, "step": 2394 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.93705678094586, "frac_reward_zero_std": 0.0, "grad_norm": 0.8783587862426684, "kl": 0.4033203125, "learning_rate": 2.438211235915211e-07, "loss": 0.004, "num_tokens": 1101480002.0, "reward": 0.051513671875, "reward_std": 0.028702616691589355, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.412109375, "rewards/tag_count_reward/std": 0.23864373564720154, "step": 2395 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.937448036386756, "frac_reward_zero_std": 0.0, "grad_norm": 0.7955068541260782, "kl": 0.399169921875, "learning_rate": 2.4083238061252565e-07, "loss": 0.004, "num_tokens": 1102034130.0, "reward": 0.051025390625, "reward_std": 0.030969075858592987, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.404296875, "rewards/tag_count_reward/std": 0.23870791494846344, "step": 2396 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.937839291827652, "frac_reward_zero_std": 0.0, "grad_norm": 0.8198539070763493, "kl": 0.408935546875, "learning_rate": 2.3786184540455449e-07, "loss": 0.0041, "num_tokens": 1102586370.0, "reward": 0.0501708984375, "reward_std": 0.032222386449575424, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.4013671875, "rewards/tag_count_reward/std": 0.26063287258148193, "step": 2397 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9382305472685479, "frac_reward_zero_std": 0.0, "grad_norm": 0.7394588431894822, "kl": 0.40625, "learning_rate": 2.3490952350976205e-07, "loss": 0.0041, "num_tokens": 1103139650.0, "reward": 0.04541015625, "reward_std": 0.029426854103803635, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.23610270023345947, "step": 2398 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.938621802709444, "frac_reward_zero_std": 0.0, "grad_norm": 0.7555357574560209, "kl": 0.401123046875, "learning_rate": 2.3197542043632115e-07, "loss": 0.004, "num_tokens": 1103693154.0, "reward": 0.048095703125, "reward_std": 0.029790226370096207, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.384765625, "rewards/tag_count_reward/std": 0.23928476870059967, "step": 2399 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9390130581503399, "frac_reward_zero_std": 0.0, "grad_norm": 0.7209554153438222, "kl": 0.4072265625, "learning_rate": 2.2905954165841847e-07, "loss": 0.0041, "num_tokens": 1104246322.0, "reward": 0.0472412109375, "reward_std": 0.027241824194788933, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.22138427197933197, "step": 2400 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9394043135912359, "frac_reward_zero_std": 0.0, "grad_norm": 0.7332916964376107, "kl": 0.399169921875, "learning_rate": 2.2616189261623568e-07, "loss": 0.004, "num_tokens": 1104800882.0, "reward": 0.0447998046875, "reward_std": 0.02945086546242237, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.2389063686132431, "step": 2401 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9397955690321319, "frac_reward_zero_std": 0.0, "grad_norm": 1.18597906510565, "kl": 0.4130859375, "learning_rate": 2.2328247871594379e-07, "loss": 0.0041, "num_tokens": 1105355698.0, "reward": 0.0460205078125, "reward_std": 0.029161855578422546, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.23836073279380798, "step": 2402 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9401868244730278, "frac_reward_zero_std": 0.0, "grad_norm": 0.840748057933645, "kl": 0.40576171875, "learning_rate": 2.204213053296922e-07, "loss": 0.0041, "num_tokens": 1105909026.0, "reward": 0.0474853515625, "reward_std": 0.029975401237607002, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.24248628318309784, "step": 2403 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9405780799139238, "frac_reward_zero_std": 0.0, "grad_norm": 0.6525810065734086, "kl": 0.405517578125, "learning_rate": 2.1757837779559865e-07, "loss": 0.0041, "num_tokens": 1106465170.0, "reward": 0.0465087890625, "reward_std": 0.028335409238934517, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23113363981246948, "step": 2404 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9409693353548197, "frac_reward_zero_std": 0.0, "grad_norm": 0.6635146894263474, "kl": 0.40478515625, "learning_rate": 2.147537014177403e-07, "loss": 0.004, "num_tokens": 1107017714.0, "reward": 0.04345703125, "reward_std": 0.029370982199907303, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.23688000440597534, "step": 2405 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9413605907957158, "frac_reward_zero_std": 0.0, "grad_norm": 0.6638446438162483, "kl": 0.40087890625, "learning_rate": 2.1194728146614386e-07, "loss": 0.004, "num_tokens": 1107571666.0, "reward": 0.049560546875, "reward_std": 0.02817726507782936, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.2269303798675537, "step": 2406 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9417518462366117, "frac_reward_zero_std": 0.0, "grad_norm": 0.668806006620245, "kl": 0.404052734375, "learning_rate": 2.091591231767709e-07, "loss": 0.004, "num_tokens": 1108125810.0, "reward": 0.044189453125, "reward_std": 0.027884162962436676, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.353515625, "rewards/tag_count_reward/std": 0.23121026158332825, "step": 2407 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9421431016775077, "frac_reward_zero_std": 0.0, "grad_norm": 0.7275301322212233, "kl": 0.40625, "learning_rate": 2.0638923175151815e-07, "loss": 0.0041, "num_tokens": 1108681138.0, "reward": 0.05029296875, "reward_std": 0.030017200857400894, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.40234375, "rewards/tag_count_reward/std": 0.24199818074703217, "step": 2408 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9425343571184037, "frac_reward_zero_std": 0.0, "grad_norm": 0.6558977354792045, "kl": 0.408203125, "learning_rate": 2.0363761235819402e-07, "loss": 0.0041, "num_tokens": 1109233698.0, "reward": 0.0479736328125, "reward_std": 0.02944951318204403, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.2453906536102295, "step": 2409 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9429256125592996, "frac_reward_zero_std": 0.0, "grad_norm": 0.6371172289708381, "kl": 0.401611328125, "learning_rate": 2.0090427013052305e-07, "loss": 0.004, "num_tokens": 1109790050.0, "reward": 0.047607421875, "reward_std": 0.029525378718972206, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.24145159125328064, "step": 2410 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9433168680001957, "frac_reward_zero_std": 0.0, "grad_norm": 0.7466486689589004, "kl": 0.40234375, "learning_rate": 1.981892101681271e-07, "loss": 0.004, "num_tokens": 1110344066.0, "reward": 0.04541015625, "reward_std": 0.029941026121377945, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.24021922051906586, "step": 2411 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9437081234410916, "frac_reward_zero_std": 0.0, "grad_norm": 0.7554645539739018, "kl": 0.396240234375, "learning_rate": 1.954924375365197e-07, "loss": 0.004, "num_tokens": 1110899266.0, "reward": 0.0447998046875, "reward_std": 0.028965231031179428, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.23476684093475342, "step": 2412 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9440993788819876, "frac_reward_zero_std": 0.0, "grad_norm": 0.6782678943672378, "kl": 0.405517578125, "learning_rate": 1.9281395726709394e-07, "loss": 0.0041, "num_tokens": 1111452146.0, "reward": 0.0416259765625, "reward_std": 0.028047244995832443, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3330078125, "rewards/tag_count_reward/std": 0.2272908091545105, "step": 2413 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9444906343228836, "frac_reward_zero_std": 0.0, "grad_norm": 0.622463018702681, "kl": 0.39794921875, "learning_rate": 1.901537743571169e-07, "loss": 0.004, "num_tokens": 1112005682.0, "reward": 0.047119140625, "reward_std": 0.02902739681303501, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.23845107853412628, "step": 2414 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9448818897637795, "frac_reward_zero_std": 0.0, "grad_norm": 0.8421550224020656, "kl": 0.403564453125, "learning_rate": 1.8751189376971623e-07, "loss": 0.004, "num_tokens": 1112557986.0, "reward": 0.04931640625, "reward_std": 0.02961297519505024, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.24072882533073425, "step": 2415 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9452731452046755, "frac_reward_zero_std": 0.0, "grad_norm": 0.8435113584698233, "kl": 0.400390625, "learning_rate": 1.8488832043387363e-07, "loss": 0.004, "num_tokens": 1113112882.0, "reward": 0.0416259765625, "reward_std": 0.02880815789103508, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3330078125, "rewards/tag_count_reward/std": 0.22836661338806152, "step": 2416 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9456644006455714, "frac_reward_zero_std": 0.0, "grad_norm": 0.656442806016146, "kl": 0.4033203125, "learning_rate": 1.8228305924441469e-07, "loss": 0.004, "num_tokens": 1113665010.0, "reward": 0.0440673828125, "reward_std": 0.028822410851716995, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.23842498660087585, "step": 2417 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9460556560864675, "frac_reward_zero_std": 0.0, "grad_norm": 0.6864038903524311, "kl": 0.45849609375, "learning_rate": 1.79696115061998e-07, "loss": 0.0046, "num_tokens": 1114218018.0, "reward": 0.0482177734375, "reward_std": 0.029948413372039795, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.24229668080806732, "step": 2418 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9464469115273634, "frac_reward_zero_std": 0.0, "grad_norm": 0.8349105259310101, "kl": 0.399169921875, "learning_rate": 1.7712749271311392e-07, "loss": 0.004, "num_tokens": 1114771362.0, "reward": 0.0440673828125, "reward_std": 0.02825385518372059, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.23111706972122192, "step": 2419 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9468381669682594, "frac_reward_zero_std": 0.0, "grad_norm": 0.671560015393701, "kl": 0.410888671875, "learning_rate": 1.7457719699006114e-07, "loss": 0.0041, "num_tokens": 1115323714.0, "reward": 0.0462646484375, "reward_std": 0.029735397547483444, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.24248628318309784, "step": 2420 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9472294224091554, "frac_reward_zero_std": 0.0, "grad_norm": 0.8549499794755154, "kl": 0.40478515625, "learning_rate": 1.7204523265095252e-07, "loss": 0.004, "num_tokens": 1115876770.0, "reward": 0.0467529296875, "reward_std": 0.029834948480129242, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2435421347618103, "step": 2421 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9476206778500513, "frac_reward_zero_std": 0.0, "grad_norm": 0.7026195216253988, "kl": 0.402099609375, "learning_rate": 1.6953160441969707e-07, "loss": 0.004, "num_tokens": 1116430546.0, "reward": 0.0462646484375, "reward_std": 0.030606430023908615, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.24449947476387024, "step": 2422 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9480119332909474, "frac_reward_zero_std": 0.0, "grad_norm": 0.7778811661732465, "kl": 0.403564453125, "learning_rate": 1.6703631698599455e-07, "loss": 0.004, "num_tokens": 1116985330.0, "reward": 0.045166015625, "reward_std": 0.03120092675089836, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.25011488795280457, "step": 2423 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9484031887318433, "frac_reward_zero_std": 0.0, "grad_norm": 0.7394677778495944, "kl": 0.407958984375, "learning_rate": 1.645593750053276e-07, "loss": 0.0041, "num_tokens": 1117538130.0, "reward": 0.048583984375, "reward_std": 0.03218340128660202, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.384765625, "rewards/tag_count_reward/std": 0.2541852593421936, "step": 2424 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9487944441727393, "frac_reward_zero_std": 0.0, "grad_norm": 0.6620161854030221, "kl": 0.389404296875, "learning_rate": 1.621007830989496e-07, "loss": 0.0039, "num_tokens": 1118095810.0, "reward": 0.0465087890625, "reward_std": 0.0293942391872406, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23844105005264282, "step": 2425 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9491856996136353, "frac_reward_zero_std": 0.0, "grad_norm": 6.864819619739945, "kl": 0.421142578125, "learning_rate": 1.596605458538769e-07, "loss": 0.0042, "num_tokens": 1118650066.0, "reward": 0.0452880859375, "reward_std": 0.028604278340935707, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.2308020144701004, "step": 2426 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9495769550545312, "frac_reward_zero_std": 0.0, "grad_norm": 0.7421993234418827, "kl": 0.408935546875, "learning_rate": 1.5723866782288545e-07, "loss": 0.0041, "num_tokens": 1119201730.0, "reward": 0.0477294921875, "reward_std": 0.031122200191020966, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.2503960430622101, "step": 2427 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9499682104954272, "frac_reward_zero_std": 0.0, "grad_norm": 0.7663573840029582, "kl": 0.39990234375, "learning_rate": 1.5483515352449518e-07, "loss": 0.004, "num_tokens": 1119756754.0, "reward": 0.049072265625, "reward_std": 0.02687935344874859, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.2150777280330658, "step": 2428 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9503594659363231, "frac_reward_zero_std": 0.0, "grad_norm": 0.8480349876771461, "kl": 0.406494140625, "learning_rate": 1.5245000744296357e-07, "loss": 0.0041, "num_tokens": 1120311618.0, "reward": 0.0450439453125, "reward_std": 0.029997404664754868, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.2420911341905594, "step": 2429 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9507507213772192, "frac_reward_zero_std": 0.0, "grad_norm": 0.6806123724876548, "kl": 0.405517578125, "learning_rate": 1.5008323402828318e-07, "loss": 0.0041, "num_tokens": 1120864834.0, "reward": 0.0487060546875, "reward_std": 0.02788897231221199, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3896484375, "rewards/tag_count_reward/std": 0.22531084716320038, "step": 2430 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9511419768181151, "frac_reward_zero_std": 0.0, "grad_norm": 0.7202796097008833, "kl": 0.40771484375, "learning_rate": 1.4773483769616403e-07, "loss": 0.0041, "num_tokens": 1121420754.0, "reward": 0.0489501953125, "reward_std": 0.031279947608709335, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3916015625, "rewards/tag_count_reward/std": 0.25382906198501587, "step": 2431 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9515332322590111, "frac_reward_zero_std": 0.0, "grad_norm": 0.7522515210564832, "kl": 0.404296875, "learning_rate": 1.4540482282803136e-07, "loss": 0.004, "num_tokens": 1121975282.0, "reward": 0.0462646484375, "reward_std": 0.02989840693771839, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.24649621546268463, "step": 2432 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9519244876999071, "frac_reward_zero_std": 0.0, "grad_norm": 0.8413093520038487, "kl": 0.39892578125, "learning_rate": 1.430931937710156e-07, "loss": 0.004, "num_tokens": 1122528642.0, "reward": 0.04736328125, "reward_std": 0.03127213567495346, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.2543433904647827, "step": 2433 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.952315743140803, "frac_reward_zero_std": 0.0, "grad_norm": 0.6895952901633368, "kl": 0.4052734375, "learning_rate": 1.4079995483794572e-07, "loss": 0.0041, "num_tokens": 1123084114.0, "reward": 0.0452880859375, "reward_std": 0.030288025736808777, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.24220183491706848, "step": 2434 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.952706998581699, "frac_reward_zero_std": 0.0, "grad_norm": 0.6752586643933098, "kl": 0.40380859375, "learning_rate": 1.3852511030733818e-07, "loss": 0.004, "num_tokens": 1123636658.0, "reward": 0.0460205078125, "reward_std": 0.03132614493370056, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.2503960430622101, "step": 2435 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.953098254022595, "frac_reward_zero_std": 0.0, "grad_norm": 1.8679903768478332, "kl": 0.403076171875, "learning_rate": 1.3626866442339237e-07, "loss": 0.004, "num_tokens": 1124190322.0, "reward": 0.0465087890625, "reward_std": 0.028523951768875122, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23113363981246948, "step": 2436 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.953489509463491, "frac_reward_zero_std": 0.0, "grad_norm": 0.6987915278564201, "kl": 0.40478515625, "learning_rate": 1.3403062139598078e-07, "loss": 0.004, "num_tokens": 1124745074.0, "reward": 0.0435791015625, "reward_std": 0.03083428181707859, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.2510528564453125, "step": 2437 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.953880764904387, "frac_reward_zero_std": 0.0, "grad_norm": 0.5945859711956146, "kl": 0.401611328125, "learning_rate": 1.3181098540064107e-07, "loss": 0.004, "num_tokens": 1125299346.0, "reward": 0.049560546875, "reward_std": 0.03031064197421074, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.24659912288188934, "step": 2438 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9542720203452829, "frac_reward_zero_std": 0.0, "grad_norm": 0.7249108680728207, "kl": 0.412353515625, "learning_rate": 1.2960976057856843e-07, "loss": 0.0041, "num_tokens": 1125852242.0, "reward": 0.04443359375, "reward_std": 0.030579835176467896, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.35546875, "rewards/tag_count_reward/std": 0.24874070286750793, "step": 2439 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9546632757861789, "frac_reward_zero_std": 0.0, "grad_norm": 0.739178585318592, "kl": 0.408447265625, "learning_rate": 1.2742695103660996e-07, "loss": 0.0041, "num_tokens": 1126407218.0, "reward": 0.0440673828125, "reward_std": 0.028650913387537003, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3525390625, "rewards/tag_count_reward/std": 0.2332284152507782, "step": 2440 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9550545312270748, "frac_reward_zero_std": 0.0, "grad_norm": 0.8128896061180874, "kl": 0.404541015625, "learning_rate": 1.2526256084725351e-07, "loss": 0.0041, "num_tokens": 1126959986.0, "reward": 0.0462646484375, "reward_std": 0.03061518631875515, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.24649621546268463, "step": 2441 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9554457866679709, "frac_reward_zero_std": 0.0, "grad_norm": 0.7244219978911105, "kl": 0.40087890625, "learning_rate": 1.231165940486234e-07, "loss": 0.004, "num_tokens": 1127515986.0, "reward": 0.0478515625, "reward_std": 0.03152455762028694, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.24947863817214966, "step": 2442 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9558370421088668, "frac_reward_zero_std": 0.0, "grad_norm": 0.7032550558586158, "kl": 0.40625, "learning_rate": 1.2098905464446807e-07, "loss": 0.0041, "num_tokens": 1128069506.0, "reward": 0.04345703125, "reward_std": 0.02876053936779499, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34765625, "rewards/tag_count_reward/std": 0.2348015010356903, "step": 2443 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9562282975497628, "frac_reward_zero_std": 0.0, "grad_norm": 0.8211128448684583, "kl": 0.407470703125, "learning_rate": 1.1887994660415903e-07, "loss": 0.0041, "num_tokens": 1128624594.0, "reward": 0.049072265625, "reward_std": 0.029545266181230545, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.23986025154590607, "step": 2444 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9566195529906588, "frac_reward_zero_std": 0.0, "grad_norm": 0.9486834983987256, "kl": 0.414306640625, "learning_rate": 1.1678927386267746e-07, "loss": 0.0041, "num_tokens": 1129176306.0, "reward": 0.0479736328125, "reward_std": 0.030761368572711945, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.24935387074947357, "step": 2445 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9570108084315547, "frac_reward_zero_std": 0.0, "grad_norm": 0.8902498634932862, "kl": 0.399169921875, "learning_rate": 1.1471704032061325e-07, "loss": 0.004, "num_tokens": 1129730466.0, "reward": 0.0479736328125, "reward_std": 0.029246071353554726, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.24136236310005188, "step": 2446 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9574020638724507, "frac_reward_zero_std": 0.0, "grad_norm": 0.8467878194207188, "kl": 0.406005859375, "learning_rate": 1.1266324984415266e-07, "loss": 0.0041, "num_tokens": 1130286786.0, "reward": 0.0487060546875, "reward_std": 0.02779790386557579, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3896484375, "rewards/tag_count_reward/std": 0.2274760901927948, "step": 2447 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9577933193133467, "frac_reward_zero_std": 0.0, "grad_norm": 0.7246842589123917, "kl": 0.404541015625, "learning_rate": 1.1062790626506948e-07, "loss": 0.004, "num_tokens": 1130839442.0, "reward": 0.0462646484375, "reward_std": 0.030888183042407036, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.24748854339122772, "step": 2448 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9581845747542427, "frac_reward_zero_std": 0.0, "grad_norm": 0.8760092968636175, "kl": 0.41015625, "learning_rate": 1.0861101338072499e-07, "loss": 0.0041, "num_tokens": 1131393618.0, "reward": 0.0499267578125, "reward_std": 0.03002762421965599, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3994140625, "rewards/tag_count_reward/std": 0.24028098583221436, "step": 2449 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9585758301951387, "frac_reward_zero_std": 0.0, "grad_norm": 0.8365741150006482, "kl": 0.406982421875, "learning_rate": 1.0661257495405363e-07, "loss": 0.0041, "num_tokens": 1131945346.0, "reward": 0.0447998046875, "reward_std": 0.028816327452659607, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3583984375, "rewards/tag_count_reward/std": 0.23580853641033173, "step": 2450 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9589670856360346, "frac_reward_zero_std": 0.0, "grad_norm": 0.8060379313743082, "kl": 0.40234375, "learning_rate": 1.0463259471356291e-07, "loss": 0.004, "num_tokens": 1132498274.0, "reward": 0.0478515625, "reward_std": 0.030070699751377106, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.24139606952667236, "step": 2451 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9593583410769306, "frac_reward_zero_std": 0.0, "grad_norm": 0.7612263338493225, "kl": 0.411376953125, "learning_rate": 1.0267107635331897e-07, "loss": 0.0041, "num_tokens": 1133052002.0, "reward": 0.0445556640625, "reward_std": 0.03175465390086174, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.25752872228622437, "step": 2452 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9597495965178265, "frac_reward_zero_std": 0.0, "grad_norm": 2.679379042162433, "kl": 0.418701171875, "learning_rate": 1.0072802353294664e-07, "loss": 0.0042, "num_tokens": 1133606034.0, "reward": 0.0458984375, "reward_std": 0.03167249634861946, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.2462649941444397, "step": 2453 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9601408519587226, "frac_reward_zero_std": 0.0, "grad_norm": 0.6929192668631314, "kl": 0.405029296875, "learning_rate": 9.880343987761721e-08, "loss": 0.004, "num_tokens": 1134158242.0, "reward": 0.0478515625, "reward_std": 0.028060192242264748, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.2320782095193863, "step": 2454 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9605321073996185, "frac_reward_zero_std": 0.0, "grad_norm": 0.7221613127286813, "kl": 0.415283203125, "learning_rate": 9.689732897804394e-08, "loss": 0.0042, "num_tokens": 1134714450.0, "reward": 0.043701171875, "reward_std": 0.028574874624609947, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.22867874801158905, "step": 2455 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9609233628405145, "frac_reward_zero_std": 0.0, "grad_norm": 0.6996153902410629, "kl": 0.3994140625, "learning_rate": 9.500969439047658e-08, "loss": 0.004, "num_tokens": 1135268994.0, "reward": 0.045654296875, "reward_std": 0.029342330992221832, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.23619189858436584, "step": 2456 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9613146182814105, "frac_reward_zero_std": 0.0, "grad_norm": 0.9249715281659862, "kl": 0.406982421875, "learning_rate": 9.314053963669245e-08, "loss": 0.0041, "num_tokens": 1135821714.0, "reward": 0.047607421875, "reward_std": 0.031199846416711807, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.376953125, "rewards/tag_count_reward/std": 0.24752916395664215, "step": 2457 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9617058737223064, "frac_reward_zero_std": 0.0, "grad_norm": 0.8444642400436525, "kl": 0.407470703125, "learning_rate": 9.128986820399199e-08, "loss": 0.0041, "num_tokens": 1136374898.0, "reward": 0.0472412109375, "reward_std": 0.029286038130521774, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.23946675658226013, "step": 2458 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9620971291632024, "frac_reward_zero_std": 0.0, "grad_norm": 0.7040929440879635, "kl": 0.4248046875, "learning_rate": 8.945768354518881e-08, "loss": 0.0043, "num_tokens": 1136927746.0, "reward": 0.0479736328125, "reward_std": 0.03270849585533142, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.25239166617393494, "step": 2459 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9624883846040984, "frac_reward_zero_std": 0.0, "grad_norm": 0.699480998664822, "kl": 0.402587890625, "learning_rate": 8.76439890786085e-08, "loss": 0.004, "num_tokens": 1137481106.0, "reward": 0.0478515625, "reward_std": 0.0321013480424881, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.25521522760391235, "step": 2460 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9628796400449944, "frac_reward_zero_std": 0.0, "grad_norm": 0.6237943229877253, "kl": 0.397216796875, "learning_rate": 8.584878818807652e-08, "loss": 0.004, "num_tokens": 1138034706.0, "reward": 0.0467529296875, "reward_std": 0.027954693883657455, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.21687528491020203, "step": 2461 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9632708954858904, "frac_reward_zero_std": 0.0, "grad_norm": 0.7758369264514876, "kl": 0.39208984375, "learning_rate": 8.407208422291702e-08, "loss": 0.0039, "num_tokens": 1138589010.0, "reward": 0.046875, "reward_std": 0.03202967345714569, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.24750594794750214, "step": 2462 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9636621509267863, "frac_reward_zero_std": 0.0, "grad_norm": 0.5760205101621824, "kl": 0.40283203125, "learning_rate": 8.231388049794398e-08, "loss": 0.004, "num_tokens": 1139141938.0, "reward": 0.0443115234375, "reward_std": 0.029366083443164825, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3544921875, "rewards/tag_count_reward/std": 0.23860159516334534, "step": 2463 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9640534063676823, "frac_reward_zero_std": 0.0, "grad_norm": 0.699716517239473, "kl": 0.396728515625, "learning_rate": 8.057418029345455e-08, "loss": 0.004, "num_tokens": 1139696338.0, "reward": 0.0494384765625, "reward_std": 0.029605118557810783, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3955078125, "rewards/tag_count_reward/std": 0.23962663114070892, "step": 2464 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9644446618085782, "frac_reward_zero_std": 0.0, "grad_norm": 0.7468179869271089, "kl": 0.401123046875, "learning_rate": 7.885298685522235e-08, "loss": 0.004, "num_tokens": 1140255330.0, "reward": 0.0472412109375, "reward_std": 0.029106464236974716, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.2465272843837738, "step": 2465 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9648359172494743, "frac_reward_zero_std": 0.0, "grad_norm": 0.8881800890629119, "kl": 0.401123046875, "learning_rate": 7.715030339449536e-08, "loss": 0.004, "num_tokens": 1140809106.0, "reward": 0.045654296875, "reward_std": 0.03143472597002983, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.24616388976573944, "step": 2466 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9652271726903702, "frac_reward_zero_std": 0.0, "grad_norm": 0.6995346487144606, "kl": 0.400634765625, "learning_rate": 7.546613308798468e-08, "loss": 0.004, "num_tokens": 1141363890.0, "reward": 0.0487060546875, "reward_std": 0.027985449880361557, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3896484375, "rewards/tag_count_reward/std": 0.22531084716320038, "step": 2467 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9656184281312662, "frac_reward_zero_std": 0.0, "grad_norm": 0.8216702000277031, "kl": 0.395751953125, "learning_rate": 7.380047907786458e-08, "loss": 0.004, "num_tokens": 1141918274.0, "reward": 0.0467529296875, "reward_std": 0.030884908512234688, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2504878044128418, "step": 2468 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9660096835721622, "frac_reward_zero_std": 0.0, "grad_norm": 0.6885107944525496, "kl": 0.393798828125, "learning_rate": 7.215334447176147e-08, "loss": 0.0039, "num_tokens": 1142473666.0, "reward": 0.049072265625, "reward_std": 0.03180921822786331, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.25664424896240234, "step": 2469 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9664009390130581, "frac_reward_zero_std": 0.0, "grad_norm": 0.7578534219672975, "kl": 0.391357421875, "learning_rate": 7.052473234274825e-08, "loss": 0.0039, "num_tokens": 1143029778.0, "reward": 0.04296875, "reward_std": 0.03184754401445389, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.34375, "rewards/tag_count_reward/std": 0.2553352415561676, "step": 2470 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9667921944539541, "frac_reward_zero_std": 0.0, "grad_norm": 0.7061515517855845, "kl": 0.4033203125, "learning_rate": 6.891464572934436e-08, "loss": 0.004, "num_tokens": 1143582642.0, "reward": 0.048583984375, "reward_std": 0.03216620534658432, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.388671875, "rewards/tag_count_reward/std": 0.256882905960083, "step": 2471 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9671834498948501, "frac_reward_zero_std": 0.0, "grad_norm": 0.7204864927971899, "kl": 0.394287109375, "learning_rate": 6.732308763550022e-08, "loss": 0.0039, "num_tokens": 1144137970.0, "reward": 0.047607421875, "reward_std": 0.0295268427580595, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2383868396282196, "step": 2472 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9675747053357461, "frac_reward_zero_std": 0.0, "grad_norm": 0.8201252742627514, "kl": 0.40478515625, "learning_rate": 6.575006103060388e-08, "loss": 0.0041, "num_tokens": 1144691218.0, "reward": 0.0467529296875, "reward_std": 0.028961043804883957, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3740234375, "rewards/tag_count_reward/std": 0.2374270260334015, "step": 2473 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9679659607766421, "frac_reward_zero_std": 0.0, "grad_norm": 0.901366084196751, "kl": 0.3974609375, "learning_rate": 6.419556884946443e-08, "loss": 0.004, "num_tokens": 1145245314.0, "reward": 0.0460205078125, "reward_std": 0.030795065686106682, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.2513729929924011, "step": 2474 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.968357216217538, "frac_reward_zero_std": 0.0, "grad_norm": 0.732450625286649, "kl": 0.407958984375, "learning_rate": 6.26596139923119e-08, "loss": 0.0041, "num_tokens": 1145799682.0, "reward": 0.047607421875, "reward_std": 0.03019523248076439, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.24547842144966125, "step": 2475 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.968748471658434, "frac_reward_zero_std": 0.0, "grad_norm": 0.7331987450342938, "kl": 0.412841796875, "learning_rate": 6.114219932479404e-08, "loss": 0.0041, "num_tokens": 1146350786.0, "reward": 0.0435791015625, "reward_std": 0.027979880571365356, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3447265625, "rewards/tag_count_reward/std": 0.21931606531143188, "step": 2476 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9691397270993299, "frac_reward_zero_std": 0.0, "grad_norm": 0.7576984070055054, "kl": 0.39404296875, "learning_rate": 5.964332767796399e-08, "loss": 0.0039, "num_tokens": 1146904386.0, "reward": 0.0477294921875, "reward_std": 0.029177196323871613, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.23525570333003998, "step": 2477 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.969530982540226, "frac_reward_zero_std": 0.0, "grad_norm": 0.625694241666361, "kl": 0.400634765625, "learning_rate": 5.816300184828039e-08, "loss": 0.004, "num_tokens": 1147458882.0, "reward": 0.0491943359375, "reward_std": 0.030951354652643204, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3935546875, "rewards/tag_count_reward/std": 0.2546575367450714, "step": 2478 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9699222379811219, "frac_reward_zero_std": 0.0, "grad_norm": 0.7196775386260977, "kl": 0.393798828125, "learning_rate": 5.670122459760286e-08, "loss": 0.0039, "num_tokens": 1148012498.0, "reward": 0.0445556640625, "reward_std": 0.029859494417905807, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.24080637097358704, "step": 2479 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9703134934220179, "frac_reward_zero_std": 0.0, "grad_norm": 0.6547169746633306, "kl": 0.404296875, "learning_rate": 5.525799865318093e-08, "loss": 0.004, "num_tokens": 1148565698.0, "reward": 0.0482177734375, "reward_std": 0.02921685203909874, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.2382160723209381, "step": 2480 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9707047488629139, "frac_reward_zero_std": 0.0, "grad_norm": 0.752528292974978, "kl": 0.40625, "learning_rate": 5.3833326707656284e-08, "loss": 0.0041, "num_tokens": 1149121154.0, "reward": 0.0482177734375, "reward_std": 0.029977088794112206, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3857421875, "rewards/tag_count_reward/std": 0.24531260132789612, "step": 2481 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9710960043038098, "frac_reward_zero_std": 0.0, "grad_norm": 0.5933595273287702, "kl": 0.39599609375, "learning_rate": 5.2427211419051605e-08, "loss": 0.004, "num_tokens": 1149674898.0, "reward": 0.0458984375, "reward_std": 0.028234899044036865, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.22872060537338257, "step": 2482 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9714872597447058, "frac_reward_zero_std": 0.0, "grad_norm": 0.7146625867268835, "kl": 0.398681640625, "learning_rate": 5.1039655410770605e-08, "loss": 0.004, "num_tokens": 1150228690.0, "reward": 0.0458984375, "reward_std": 0.03319566696882248, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.2607669234275818, "step": 2483 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9718785151856018, "frac_reward_zero_std": 0.0, "grad_norm": 0.6816699176492238, "kl": 0.393798828125, "learning_rate": 4.9670661271589147e-08, "loss": 0.0039, "num_tokens": 1150782338.0, "reward": 0.0478515625, "reward_std": 0.02898353710770607, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.23626485466957092, "step": 2484 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9722697706264978, "frac_reward_zero_std": 0.0, "grad_norm": 0.881926852811545, "kl": 0.4013671875, "learning_rate": 4.832023155565302e-08, "loss": 0.004, "num_tokens": 1151337058.0, "reward": 0.044921875, "reward_std": 0.031266842037439346, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.2529240846633911, "step": 2485 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9726610260673938, "frac_reward_zero_std": 0.0, "grad_norm": 1.0185731935674416, "kl": 0.400390625, "learning_rate": 4.698836878247015e-08, "loss": 0.004, "num_tokens": 1151891890.0, "reward": 0.0511474609375, "reward_std": 0.04474629461765289, "rewards/accuracy_reward/mean": 0.00390625, "rewards/accuracy_reward/std": 0.0625, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3779296875, "rewards/tag_count_reward/std": 0.24850773811340332, "step": 2486 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9730522815082897, "frac_reward_zero_std": 0.0, "grad_norm": 0.6335296740074434, "kl": 0.398193359375, "learning_rate": 4.567507543691174e-08, "loss": 0.004, "num_tokens": 1152445250.0, "reward": 0.0478515625, "reward_std": 0.02920282632112503, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.23418088257312775, "step": 2487 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9734435369491857, "frac_reward_zero_std": 0.0, "grad_norm": 0.6054274114077252, "kl": 0.400390625, "learning_rate": 4.438035396920004e-08, "loss": 0.004, "num_tokens": 1152998354.0, "reward": 0.046875, "reward_std": 0.031013064086437225, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24950933456420898, "step": 2488 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9738347923900816, "frac_reward_zero_std": 0.0, "grad_norm": 3.6972527310015195, "kl": 0.395263671875, "learning_rate": 4.310420679490945e-08, "loss": 0.004, "num_tokens": 1153552130.0, "reward": 0.0474853515625, "reward_std": 0.02844046987593174, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.23215864598751068, "step": 2489 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9742260478309777, "frac_reward_zero_std": 0.0, "grad_norm": 0.6529382585779665, "kl": 0.3955078125, "learning_rate": 4.1846636294958777e-08, "loss": 0.004, "num_tokens": 1154107362.0, "reward": 0.0458984375, "reward_std": 0.03048306703567505, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3671875, "rewards/tag_count_reward/std": 0.24442310631275177, "step": 2490 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9746173032718736, "frac_reward_zero_std": 0.0, "grad_norm": 0.6794552760890802, "kl": 0.396240234375, "learning_rate": 4.0607644815610084e-08, "loss": 0.004, "num_tokens": 1154660162.0, "reward": 0.047607421875, "reward_std": 0.02839704230427742, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2300145924091339, "step": 2491 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9750085587127696, "frac_reward_zero_std": 0.0, "grad_norm": 34.298456039907016, "kl": 0.406005859375, "learning_rate": 3.938723466846206e-08, "loss": 0.0041, "num_tokens": 1155213442.0, "reward": 0.048095703125, "reward_std": 0.03253360092639923, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2513977587223053, "step": 2492 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9753998141536656, "frac_reward_zero_std": 0.0, "grad_norm": 0.7230518982475316, "kl": 0.408447265625, "learning_rate": 3.818540813044558e-08, "loss": 0.0041, "num_tokens": 1155768578.0, "reward": 0.0501708984375, "reward_std": 0.030285656452178955, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3974609375, "rewards/tag_count_reward/std": 0.24047216773033142, "step": 2493 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9757910695945615, "frac_reward_zero_std": 0.0, "grad_norm": 0.750982746477644, "kl": 0.39794921875, "learning_rate": 3.7002167443818126e-08, "loss": 0.004, "num_tokens": 1156322450.0, "reward": 0.047607421875, "reward_std": 0.030942026525735855, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2513977587223053, "step": 2494 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9761823250354575, "frac_reward_zero_std": 0.0, "grad_norm": 0.7074276459576319, "kl": 0.399169921875, "learning_rate": 3.583751481616382e-08, "loss": 0.004, "num_tokens": 1156879522.0, "reward": 0.048828125, "reward_std": 0.03070131316781044, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.25485482811927795, "step": 2495 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9765735804763535, "frac_reward_zero_std": 0.0, "grad_norm": 0.672818788363612, "kl": 0.39892578125, "learning_rate": 3.469145242038674e-08, "loss": 0.004, "num_tokens": 1157433970.0, "reward": 0.046630859375, "reward_std": 0.02827083319425583, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.22901344299316406, "step": 2496 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9769648359172495, "frac_reward_zero_std": 0.0, "grad_norm": 0.7708281982507573, "kl": 0.399658203125, "learning_rate": 3.356398239470427e-08, "loss": 0.004, "num_tokens": 1157992546.0, "reward": 0.0457763671875, "reward_std": 0.03003501519560814, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24638743698596954, "step": 2497 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9773560913581455, "frac_reward_zero_std": 0.0, "grad_norm": 0.7075365917812784, "kl": 0.40087890625, "learning_rate": 3.245510684265041e-08, "loss": 0.004, "num_tokens": 1158545938.0, "reward": 0.0499267578125, "reward_std": 0.030008826404809952, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3994140625, "rewards/tag_count_reward/std": 0.24028098583221436, "step": 2498 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9777473467990414, "frac_reward_zero_std": 0.0, "grad_norm": 0.6237248027136107, "kl": 0.4052734375, "learning_rate": 3.1364827833063605e-08, "loss": 0.004, "num_tokens": 1159099778.0, "reward": 0.0465087890625, "reward_std": 0.029409101232886314, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3681640625, "rewards/tag_count_reward/std": 0.2310507893562317, "step": 2499 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9781386022399374, "frac_reward_zero_std": 0.0, "grad_norm": 0.7737925110802902, "kl": 0.40185546875, "learning_rate": 3.029314740008671e-08, "loss": 0.004, "num_tokens": 1159654386.0, "reward": 0.045166015625, "reward_std": 0.031484950333833694, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2540043890476227, "step": 2500 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9785298576808333, "frac_reward_zero_std": 0.0, "grad_norm": 0.8464413994867137, "kl": 0.40283203125, "learning_rate": 2.9240067543165883e-08, "loss": 0.004, "num_tokens": 1160206514.0, "reward": 0.047607421875, "reward_std": 0.030686700716614723, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.24447792768478394, "step": 2501 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9789211131217294, "frac_reward_zero_std": 0.0, "grad_norm": 0.6357902276941159, "kl": 0.396484375, "learning_rate": 2.8205590227040613e-08, "loss": 0.004, "num_tokens": 1160758450.0, "reward": 0.0479736328125, "reward_std": 0.02853226102888584, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.2341461330652237, "step": 2502 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9793123685626254, "frac_reward_zero_std": 0.0, "grad_norm": 0.6671563109020142, "kl": 0.40478515625, "learning_rate": 2.7189717381745917e-08, "loss": 0.0041, "num_tokens": 1161312818.0, "reward": 0.0469970703125, "reward_std": 0.030862968415021896, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.2514643967151642, "step": 2503 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9797036240035213, "frac_reward_zero_std": 0.0, "grad_norm": 0.6675047750080475, "kl": 0.400146484375, "learning_rate": 2.6192450902604584e-08, "loss": 0.004, "num_tokens": 1161865842.0, "reward": 0.049072265625, "reward_std": 0.031235555186867714, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.25182393193244934, "step": 2504 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9800948794444173, "frac_reward_zero_std": 0.0, "grad_norm": 0.7291493732201225, "kl": 0.40087890625, "learning_rate": 2.5213792650227165e-08, "loss": 0.004, "num_tokens": 1162420210.0, "reward": 0.046142578125, "reward_std": 0.030399058014154434, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.2474672645330429, "step": 2505 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9804861348853132, "frac_reward_zero_std": 0.0, "grad_norm": 0.7822978470080544, "kl": 0.399658203125, "learning_rate": 2.4253744450504212e-08, "loss": 0.004, "num_tokens": 1162975778.0, "reward": 0.048095703125, "reward_std": 0.0292280912399292, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.384765625, "rewards/tag_count_reward/std": 0.23619189858436584, "step": 2506 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9808773903262092, "frac_reward_zero_std": 0.0, "grad_norm": 0.6805489018986463, "kl": 0.41796875, "learning_rate": 2.3312308094607382e-08, "loss": 0.0042, "num_tokens": 1163531986.0, "reward": 0.04736328125, "reward_std": 0.03148827329277992, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24454841017723083, "step": 2507 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9812686457671052, "frac_reward_zero_std": 0.0, "grad_norm": 0.6934830554901249, "kl": 0.40478515625, "learning_rate": 2.2389485338983884e-08, "loss": 0.004, "num_tokens": 1164085666.0, "reward": 0.0462646484375, "reward_std": 0.0305948369204998, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3701171875, "rewards/tag_count_reward/std": 0.2454998642206192, "step": 2508 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9816599012080012, "frac_reward_zero_std": 0.0, "grad_norm": 0.6743799565838836, "kl": 0.39208984375, "learning_rate": 2.1485277905353153e-08, "loss": 0.0039, "num_tokens": 1164640274.0, "reward": 0.0494384765625, "reward_std": 0.030759960412979126, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3916015625, "rewards/tag_count_reward/std": 0.24598297476768494, "step": 2509 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9820511566488972, "frac_reward_zero_std": 0.0, "grad_norm": 0.8494714687626643, "kl": 0.401611328125, "learning_rate": 2.059968748070129e-08, "loss": 0.004, "num_tokens": 1165194578.0, "reward": 0.0465087890625, "reward_std": 0.029276221990585327, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23637627065181732, "step": 2510 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9824424120897931, "frac_reward_zero_std": 0.0, "grad_norm": 0.7470063708687833, "kl": 0.402587890625, "learning_rate": 1.973271571728441e-08, "loss": 0.004, "num_tokens": 1165748194.0, "reward": 0.047607421875, "reward_std": 0.02818656526505947, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2300145924091339, "step": 2511 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9828336675306891, "frac_reward_zero_std": 0.0, "grad_norm": 0.8347533814213768, "kl": 0.39501953125, "learning_rate": 1.8884364232619744e-08, "loss": 0.0039, "num_tokens": 1166302706.0, "reward": 0.0499267578125, "reward_std": 0.03164567053318024, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3955078125, "rewards/tag_count_reward/std": 0.24866178631782532, "step": 2512 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.983224922971585, "frac_reward_zero_std": 0.0, "grad_norm": 0.7278907902892318, "kl": 0.4072265625, "learning_rate": 1.8054634609484534e-08, "loss": 0.0041, "num_tokens": 1166855282.0, "reward": 0.0457763671875, "reward_std": 0.030108120292425156, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24935387074947357, "step": 2513 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9836161784124811, "frac_reward_zero_std": 0.0, "grad_norm": 0.7842254871254339, "kl": 0.404296875, "learning_rate": 1.72435283959127e-08, "loss": 0.004, "num_tokens": 1167409730.0, "reward": 0.04638671875, "reward_std": 0.027022721245884895, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37109375, "rewards/tag_count_reward/std": 0.2202591598033905, "step": 2514 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9840074338533771, "frac_reward_zero_std": 0.0, "grad_norm": 0.9013945011190296, "kl": 0.400146484375, "learning_rate": 1.645104710519374e-08, "loss": 0.004, "num_tokens": 1167962642.0, "reward": 0.044677734375, "reward_std": 0.03025558777153492, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.357421875, "rewards/tag_count_reward/std": 0.24690952897071838, "step": 2515 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.984398689294273, "frac_reward_zero_std": 0.0, "grad_norm": 0.7425876499986462, "kl": 0.3935546875, "learning_rate": 1.5677192215867164e-08, "loss": 0.0039, "num_tokens": 1168517602.0, "reward": 0.049072265625, "reward_std": 0.029578156769275665, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.24087992310523987, "step": 2516 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.984789944735169, "frac_reward_zero_std": 0.0, "grad_norm": 0.6964805075472424, "kl": 0.389404296875, "learning_rate": 1.4921965171720288e-08, "loss": 0.0039, "num_tokens": 1169071426.0, "reward": 0.04931640625, "reward_std": 0.030771564692258835, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.2497241199016571, "step": 2517 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9851812001760649, "frac_reward_zero_std": 0.0, "grad_norm": 0.9918935542283618, "kl": 0.4013671875, "learning_rate": 1.4185367381788218e-08, "loss": 0.004, "num_tokens": 1169623618.0, "reward": 0.046142578125, "reward_std": 0.030304862186312675, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.24647484719753265, "step": 2518 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9855724556169609, "frac_reward_zero_std": 0.0, "grad_norm": 0.7247604590573987, "kl": 0.402587890625, "learning_rate": 1.3467400220348315e-08, "loss": 0.004, "num_tokens": 1170175378.0, "reward": 0.0465087890625, "reward_std": 0.02822837419807911, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23324483633041382, "step": 2519 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9859637110578569, "frac_reward_zero_std": 0.0, "grad_norm": 0.6103817964855976, "kl": 0.396728515625, "learning_rate": 1.2768065026919074e-08, "loss": 0.004, "num_tokens": 1170728834.0, "reward": 0.04736328125, "reward_std": 0.030861705541610718, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.2514358460903168, "step": 2520 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9863549664987529, "frac_reward_zero_std": 0.0, "grad_norm": 0.7078814755720515, "kl": 0.398681640625, "learning_rate": 1.20873631062568e-08, "loss": 0.004, "num_tokens": 1171282946.0, "reward": 0.046875, "reward_std": 0.02970602735877037, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24152295291423798, "step": 2521 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9867462219396489, "frac_reward_zero_std": 0.0, "grad_norm": 0.6369442318552662, "kl": 0.39990234375, "learning_rate": 1.142529572835227e-08, "loss": 0.004, "num_tokens": 1171835746.0, "reward": 0.049560546875, "reward_std": 0.03016156516969204, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.396484375, "rewards/tag_count_reward/std": 0.24659912288188934, "step": 2522 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9871374773805448, "frac_reward_zero_std": 0.0, "grad_norm": 0.7946249938461899, "kl": 0.398193359375, "learning_rate": 1.0781864128431852e-08, "loss": 0.004, "num_tokens": 1172390802.0, "reward": 0.0465087890625, "reward_std": 0.03236508369445801, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.2600739300251007, "step": 2523 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9875287328214408, "frac_reward_zero_std": 0.0, "grad_norm": 0.722850243842884, "kl": 0.39306640625, "learning_rate": 1.0157069506950834e-08, "loss": 0.0039, "num_tokens": 1172944962.0, "reward": 0.0469970703125, "reward_std": 0.030432287603616714, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.24950741231441498, "step": 2524 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9879199882623367, "frac_reward_zero_std": 0.0, "grad_norm": 0.8116079354779682, "kl": 0.448974609375, "learning_rate": 9.550913029596765e-09, "loss": 0.0045, "num_tokens": 1173498066.0, "reward": 0.0479736328125, "reward_std": 0.030003877356648445, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3837890625, "rewards/tag_count_reward/std": 0.24136236310005188, "step": 2525 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9883112437032328, "frac_reward_zero_std": 0.0, "grad_norm": 0.7247905365712529, "kl": 0.393310546875, "learning_rate": 8.963395827278343e-09, "loss": 0.0039, "num_tokens": 1174051394.0, "reward": 0.045654296875, "reward_std": 0.02985377050936222, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.365234375, "rewards/tag_count_reward/std": 0.24030688405036926, "step": 2526 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9887024991441288, "frac_reward_zero_std": 0.0, "grad_norm": 0.702834908375876, "kl": 0.404052734375, "learning_rate": 8.394518996135414e-09, "loss": 0.004, "num_tokens": 1174606418.0, "reward": 0.046630859375, "reward_std": 0.02999039739370346, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.373046875, "rewards/tag_count_reward/std": 0.24454058706760406, "step": 2527 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9890937545850247, "frac_reward_zero_std": 0.0, "grad_norm": 0.9882598114948542, "kl": 0.404296875, "learning_rate": 7.844283597523428e-09, "loss": 0.004, "num_tokens": 1175158834.0, "reward": 0.04248046875, "reward_std": 0.027424346655607224, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.33984375, "rewards/tag_count_reward/std": 0.22192203998565674, "step": 2528 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9894850100259207, "frac_reward_zero_std": 0.0, "grad_norm": 0.6411021503618761, "kl": 0.397705078125, "learning_rate": 7.312690658024535e-09, "loss": 0.004, "num_tokens": 1175712194.0, "reward": 0.043701171875, "reward_std": 0.02960643544793129, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.349609375, "rewards/tag_count_reward/std": 0.236062154173851, "step": 2529 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9898762654668166, "frac_reward_zero_std": 0.0, "grad_norm": 0.805941419805365, "kl": 0.4150390625, "learning_rate": 6.799741169435381e-09, "loss": 0.0042, "num_tokens": 1176266002.0, "reward": 0.0479736328125, "reward_std": 0.031563032418489456, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.24649621546268463, "step": 2530 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9902675209077126, "frac_reward_zero_std": 0.0, "grad_norm": 0.7322659709457177, "kl": 0.39599609375, "learning_rate": 6.3054360887704335e-09, "loss": 0.004, "num_tokens": 1176820642.0, "reward": 0.0457763671875, "reward_std": 0.029846934601664543, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3662109375, "rewards/tag_count_reward/std": 0.24034473299980164, "step": 2531 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9906587763486085, "frac_reward_zero_std": 0.0, "grad_norm": 0.668272161982815, "kl": 0.3955078125, "learning_rate": 5.8297763382597625e-09, "loss": 0.004, "num_tokens": 1177375618.0, "reward": 0.0455322265625, "reward_std": 0.029268423095345497, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3642578125, "rewards/tag_count_reward/std": 0.23301485180854797, "step": 2532 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9910500317895046, "frac_reward_zero_std": 0.0, "grad_norm": 0.6571033061810981, "kl": 0.397216796875, "learning_rate": 5.37276280534682e-09, "loss": 0.004, "num_tokens": 1177928514.0, "reward": 0.04541015625, "reward_std": 0.03113570250570774, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.36328125, "rewards/tag_count_reward/std": 0.25216588377952576, "step": 2533 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9914412872304006, "frac_reward_zero_std": 0.0, "grad_norm": 0.6312713560425508, "kl": 0.399658203125, "learning_rate": 4.9343963426840006e-09, "loss": 0.004, "num_tokens": 1178482210.0, "reward": 0.049072265625, "reward_std": 0.02999797835946083, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.392578125, "rewards/tag_count_reward/std": 0.24591486155986786, "step": 2534 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9918325426712965, "frac_reward_zero_std": 0.0, "grad_norm": 0.8199253372455003, "kl": 0.399658203125, "learning_rate": 4.5146777681381874e-09, "loss": 0.004, "num_tokens": 1179034882.0, "reward": 0.0496826171875, "reward_std": 0.029817089438438416, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3935546875, "rewards/tag_count_reward/std": 0.2387620508670807, "step": 2535 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9922237981121925, "frac_reward_zero_std": 0.0, "grad_norm": 0.595469727574871, "kl": 0.395751953125, "learning_rate": 4.113607864781877e-09, "loss": 0.004, "num_tokens": 1179588946.0, "reward": 0.0469970703125, "reward_std": 0.029302509501576424, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3759765625, "rewards/tag_count_reward/std": 0.24454645812511444, "step": 2536 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9926150535530884, "frac_reward_zero_std": 0.0, "grad_norm": 0.7551232540523113, "kl": 0.401611328125, "learning_rate": 3.731187380893176e-09, "loss": 0.004, "num_tokens": 1180143794.0, "reward": 0.04931640625, "reward_std": 0.030379055067896843, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.39453125, "rewards/tag_count_reward/std": 0.24476754665374756, "step": 2537 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9930063089939845, "frac_reward_zero_std": 0.0, "grad_norm": 0.7244421237729363, "kl": 0.391845703125, "learning_rate": 3.3674170299602405e-09, "loss": 0.0039, "num_tokens": 1180699026.0, "reward": 0.044921875, "reward_std": 0.02923421375453472, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.359375, "rewards/tag_count_reward/std": 0.2389724850654602, "step": 2538 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9933975644348805, "frac_reward_zero_std": 0.0, "grad_norm": 0.7343601619414819, "kl": 0.403564453125, "learning_rate": 3.0222974906701784e-09, "loss": 0.004, "num_tokens": 1181253762.0, "reward": 0.0465087890625, "reward_std": 0.03125360608100891, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.24949206411838531, "step": 2539 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9937888198757764, "frac_reward_zero_std": 0.0, "grad_norm": 0.6773936710865862, "kl": 0.395263671875, "learning_rate": 2.695829406917927e-09, "loss": 0.004, "num_tokens": 1181807266.0, "reward": 0.0478515625, "reward_std": 0.027944426983594894, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3828125, "rewards/tag_count_reward/std": 0.22673621773719788, "step": 2540 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9941800753166724, "frac_reward_zero_std": 0.0, "grad_norm": 1.0257674175280893, "kl": 0.4033203125, "learning_rate": 2.3880133877962617e-09, "loss": 0.004, "num_tokens": 1182362018.0, "reward": 0.048828125, "reward_std": 0.02979425899684429, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.390625, "rewards/tag_count_reward/std": 0.23587551712989807, "step": 2541 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9945713307575683, "frac_reward_zero_std": 0.0, "grad_norm": 0.7455725349225321, "kl": 0.39990234375, "learning_rate": 2.0988500076013494e-09, "loss": 0.004, "num_tokens": 1182913746.0, "reward": 0.045166015625, "reward_std": 0.02790915220975876, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.361328125, "rewards/tag_count_reward/std": 0.2264573723077774, "step": 2542 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9949625861984643, "frac_reward_zero_std": 0.0, "grad_norm": 0.7282733462859541, "kl": 0.40185546875, "learning_rate": 1.8283398058283053e-09, "loss": 0.004, "num_tokens": 1183468482.0, "reward": 0.047607421875, "reward_std": 0.028502795845270157, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.380859375, "rewards/tag_count_reward/std": 0.2300145924091339, "step": 2543 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9953538416393602, "frac_reward_zero_std": 0.0, "grad_norm": 0.9340817049915129, "kl": 0.40283203125, "learning_rate": 1.576483287170083e-09, "loss": 0.004, "num_tokens": 1184020610.0, "reward": 0.0445556640625, "reward_std": 0.030494466423988342, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3564453125, "rewards/tag_count_reward/std": 0.24484382569789886, "step": 2544 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9957450970802563, "frac_reward_zero_std": 0.0, "grad_norm": 0.685199874749508, "kl": 0.390380859375, "learning_rate": 1.343280921518586e-09, "loss": 0.0039, "num_tokens": 1184574770.0, "reward": 0.051513671875, "reward_std": 0.02972475253045559, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.412109375, "rewards/tag_count_reward/std": 0.24068905413150787, "step": 2545 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9961363525211523, "frac_reward_zero_std": 0.0, "grad_norm": 0.7571587921545332, "kl": 0.39990234375, "learning_rate": 1.1287331439602256e-09, "loss": 0.004, "num_tokens": 1185127890.0, "reward": 0.0450439453125, "reward_std": 0.029483258724212646, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3603515625, "rewards/tag_count_reward/std": 0.24107658863067627, "step": 2546 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9965276079620482, "frac_reward_zero_std": 0.0, "grad_norm": 0.6473739953003306, "kl": 0.397705078125, "learning_rate": 9.328403547792518e-10, "loss": 0.004, "num_tokens": 1185682146.0, "reward": 0.046142578125, "reward_std": 0.028192784637212753, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.369140625, "rewards/tag_count_reward/std": 0.2289465367794037, "step": 2547 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9969188634029442, "frac_reward_zero_std": 0.0, "grad_norm": 0.6934385052871239, "kl": 0.4033203125, "learning_rate": 7.556029194566439e-10, "loss": 0.004, "num_tokens": 1186235362.0, "reward": 0.0477294921875, "reward_std": 0.03010508045554161, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3818359375, "rewards/tag_count_reward/std": 0.24545307457447052, "step": 2548 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9973101188438401, "frac_reward_zero_std": 0.0, "grad_norm": 0.7659179769415606, "kl": 0.395263671875, "learning_rate": 5.970211686623373e-10, "loss": 0.004, "num_tokens": 1186790626.0, "reward": 0.0435791015625, "reward_std": 0.027621187269687653, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3486328125, "rewards/tag_count_reward/std": 0.22532784938812256, "step": 2549 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9977013742847362, "frac_reward_zero_std": 0.0, "grad_norm": 0.9073199791454238, "kl": 0.40185546875, "learning_rate": 4.570953982674375e-10, "loss": 0.004, "num_tokens": 1187344578.0, "reward": 0.0465087890625, "reward_std": 0.031158607453107834, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.2514491677284241, "step": 2550 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9980926297256322, "frac_reward_zero_std": 0.0, "grad_norm": 0.8457059942871437, "kl": 0.39794921875, "learning_rate": 3.3582586932978665e-10, "loss": 0.004, "num_tokens": 1187897346.0, "reward": 0.04736328125, "reward_std": 0.031195178627967834, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.375, "rewards/tag_count_reward/std": 0.24454841017723083, "step": 2551 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9984838851665281, "frac_reward_zero_std": 0.0, "grad_norm": 0.8524407045939009, "kl": 0.398193359375, "learning_rate": 2.3321280810617575e-10, "loss": 0.004, "num_tokens": 1188451410.0, "reward": 0.04736328125, "reward_std": 0.03010387532413006, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.37890625, "rewards/tag_count_reward/std": 0.2404741644859314, "step": 2552 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.9988751406074241, "frac_reward_zero_std": 0.0, "grad_norm": 0.6608043290566652, "kl": 0.3974609375, "learning_rate": 1.4925640603902226e-10, "loss": 0.004, "num_tokens": 1189005682.0, "reward": 0.0465087890625, "reward_std": 0.029519639909267426, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3720703125, "rewards/tag_count_reward/std": 0.23637627065181732, "step": 2553 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.99926639604832, "frac_reward_zero_std": 0.0, "grad_norm": 0.6662530079264714, "kl": 0.39453125, "learning_rate": 8.395681976969272e-11, "loss": 0.0039, "num_tokens": 1189561570.0, "reward": 0.0474853515625, "reward_std": 0.028921928256750107, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.0, "rewards/format_reward/std": 0.0, "rewards/tag_count_reward/mean": 0.3798828125, "rewards/tag_count_reward/std": 0.2363438606262207, "step": 2554 }, { "clip_ratio/high_max": 0.0, "clip_ratio/high_mean": 0.0, "clip_ratio/low_mean": 0.0, "clip_ratio/low_min": 0.0, "clip_ratio/region_mean": 0.0, "completions/clipped_ratio": 1.0, "completions/max_length": 2048.0, "completions/max_terminated_length": 0.0, "completions/mean_length": 2048.0, "completions/mean_terminated_length": 0.0, "completions/min_length": 2048.0, "completions/min_terminated_length": 0.0, "epoch": 0.999657651489216, "frac_reward_zero_std": 0.0, "grad_norm": 0.6117556992762061, "kl": 0.405517578125, "learning_rate": 3.7314171127400544e-11, "loss": 0.0041, "num_tokens": 1190116322.0, "reward": 0.0457763671875, "reward_std": 0.030130110681056976, "rewards/accuracy_reward/mean": 0.0, "rewards/accuracy_reward/std": 0.0, "rewards/format_reward/mean": 0.00390625, "rewards/format_reward/std": 0.0625, "rewards/tag_count_reward/mean": 0.3623046875, "rewards/tag_count_reward/std": 0.23708805441856384, "step": 2555 }, { "epoch": 0.999657651489216, "step": 2555, "total_flos": 0.0, "train_loss": 12868.4468021695, "train_runtime": 382532.2881, "train_samples_per_second": 0.107, "train_steps_per_second": 0.007 } ], "logging_steps": 1, "max_steps": 2556, "num_input_tokens_seen": 1190116322, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 16, "trial_name": null, "trial_params": null }